mojoshader.c
author Ryan C. Gordon <icculus@icculus.org>
Tue, 24 May 2016 01:05:54 -0400
changeset 1160 24ee60c85ec3
parent 1159 d4f97945e2f5
child 1161 6085abb2edaa
permissions -rw-r--r--
metal: Group all uniform arrays into one struct with explicit array sizes. This lets the Metal compiler know more about the data (such as array overflow risks and maybe optimization strategies), but also lets us mush all uniforms into one buffer index instead of splitting them across three.

/**
 * MojoShader; generate shader programs from bytecode of compiled
 *  Direct3D shaders.
 *
 * Please see the file LICENSE.txt in the source's root directory.
 *
 *  This file written by Ryan C. Gordon.
 */

// !!! FIXME: this file really needs to be split up.
// !!! FIXME: I keep changing coding styles for symbols and typedefs.

// !!! FIXME: rules from MSDN about temp registers we probably don't check.
// - There are limited temporaries: vs_1_1 has 12 (ps_1_1 has _2_!).
// - SM2 apparently was variable, between 12 and 32. Shader Model 3 has 32.
// - A maximum of three temp registers can be used in a single instruction.

#define __MOJOSHADER_INTERNAL__ 1
#include "mojoshader_internal.h"

typedef struct ConstantsList
{
    MOJOSHADER_constant constant;
    struct ConstantsList *next;
} ConstantsList;

typedef struct VariableList
{
    MOJOSHADER_uniformType type;
    int index;
    int count;
    ConstantsList *constant;
    int used;
    int emit_position;  // used in some profiles.
    struct VariableList *next;
} VariableList;

typedef struct RegisterList
{
    RegisterType regtype;
    int regnum;
    MOJOSHADER_usage usage;
    unsigned int index;
    int writemask;
    int misc;
    int written;
    const VariableList *array;
    struct RegisterList *next;
} RegisterList;

typedef struct
{
    const uint32 *token;   // this is the unmolested token in the stream.
    int regnum;
    int swizzle;  // xyzw (all four, not split out).
    int swizzle_x;
    int swizzle_y;
    int swizzle_z;
    int swizzle_w;
    SourceMod src_mod;
    RegisterType regtype;
    int relative;
    RegisterType relative_regtype;
    int relative_regnum;
    int relative_component;
    const VariableList *relative_array;
} SourceArgInfo;

struct Profile;  // predeclare.

typedef struct CtabData
{
    int have_ctab;
    int symbol_count;
    MOJOSHADER_symbol *symbols;
} CtabData;

// Context...this is state that changes as we parse through a shader...
typedef struct Context
{
    int isfail;
    int out_of_memory;
    MOJOSHADER_malloc malloc;
    MOJOSHADER_free free;
    void *malloc_data;
    int current_position;
    const uint32 *orig_tokens;
    const uint32 *tokens;
    uint32 tokencount;
    int know_shader_size;
    const MOJOSHADER_swizzle *swizzles;
    unsigned int swizzles_count;
    const MOJOSHADER_samplerMap *samplermap;
    unsigned int samplermap_count;
    Buffer *output;
    Buffer *preflight;
    Buffer *globals;
    Buffer *inputs;
    Buffer *outputs;
    Buffer *helpers;
    Buffer *subroutines;
    Buffer *mainline_intro;
    Buffer *mainline_arguments;
    Buffer *mainline_top;
    Buffer *mainline;
    Buffer *postflight;
    Buffer *ignore;
    Buffer *output_stack[3];
    int indent_stack[3];
    int output_stack_len;
    int indent;
    const char *shader_type_str;
    const char *endline;
    const char *mainfn;
    int endline_len;
    int profileid;
    const struct Profile *profile;
    MOJOSHADER_shaderType shader_type;
    uint8 major_ver;
    uint8 minor_ver;
    DestArgInfo dest_arg;
    SourceArgInfo source_args[5];
    SourceArgInfo predicate_arg;  // for predicated instructions.
    uint32 dwords[4];
    uint32 version_token;
    int instruction_count;
    uint32 instruction_controls;
    uint32 previous_opcode;
    int coissue;
    int loops;
    int reps;
    int max_reps;
    int cmps;
    int scratch_registers;
    int max_scratch_registers;
    int branch_labels_stack_index;
    int branch_labels_stack[32];
    int assigned_branch_labels;
    int assigned_vertex_attributes;
    int last_address_reg_component;
    RegisterList used_registers;
    RegisterList defined_registers;
    ErrorList *errors;
    int constant_count;
    ConstantsList *constants;
    int uniform_count;
    int uniform_float4_count;
    int uniform_int4_count;
    int uniform_bool_count;
    RegisterList uniforms;
    int attribute_count;
    RegisterList attributes;
    int sampler_count;
    RegisterList samplers;
    VariableList *variables;  // variables to register mapping.
    int centroid_allowed;
    CtabData ctab;
    int have_relative_input_registers;
    int have_multi_color_outputs;
    int determined_constants_arrays;
    int predicated;
    int uses_pointsize;
    int uses_fog;

    // !!! FIXME: move these into SUPPORT_PROFILE sections.
    int glsl_generated_lit_helper;
    int glsl_generated_texldd_setup;
    int glsl_generated_texm3x3spec_helper;
    int arb1_wrote_position;
    // !!! FIXME: move these into SUPPORT_PROFILE sections.

    int have_preshader;
    int ignores_ctab;
    int reset_texmpad;
    int texm3x2pad_dst0;
    int texm3x2pad_src0;
    int texm3x3pad_dst0;
    int texm3x3pad_src0;
    int texm3x3pad_dst1;
    int texm3x3pad_src1;
    MOJOSHADER_preshader *preshader;

#if SUPPORT_PROFILE_ARB1_NV
    int profile_supports_nv2;
    int profile_supports_nv3;
    int profile_supports_nv4;
#endif
#if SUPPORT_PROFILE_GLSL120
    int profile_supports_glsl120;
#endif
#if SUPPORT_PROFILE_GLSLES
    int profile_supports_glsles;
#endif

#if SUPPORT_PROFILE_METAL
    int metal_need_header_common;
    int metal_need_header_math;
    int metal_need_header_relational;
    int metal_need_header_geometric;
    int metal_need_header_graphics;
    int metal_need_header_texture;
#endif
} Context;


// Use these macros so we can remove all bits of these profiles from the build.
#if SUPPORT_PROFILE_ARB1_NV
#define support_nv2(ctx) ((ctx)->profile_supports_nv2)
#define support_nv3(ctx) ((ctx)->profile_supports_nv3)
#define support_nv4(ctx) ((ctx)->profile_supports_nv4)
#else
#define support_nv2(ctx) (0)
#define support_nv3(ctx) (0)
#define support_nv4(ctx) (0)
#endif

#if SUPPORT_PROFILE_GLSL120
#define support_glsl120(ctx) ((ctx)->profile_supports_glsl120)
#else
#define support_glsl120(ctx) (0)
#endif

#if SUPPORT_PROFILE_GLSLES
#define support_glsles(ctx) ((ctx)->profile_supports_glsles)
#else
#define support_glsles(ctx) (0)
#endif


// Profile entry points...

// one emit function for each opcode in each profile.
typedef void (*emit_function)(Context *ctx);

// one emit function for starting output in each profile.
typedef void (*emit_start)(Context *ctx, const char *profilestr);

// one emit function for ending output in each profile.
typedef void (*emit_end)(Context *ctx);

// one emit function for phase opcode output in each profile.
typedef void (*emit_phase)(Context *ctx);

// one emit function for finalizing output in each profile.
typedef void (*emit_finalize)(Context *ctx);

// one emit function for global definitions in each profile.
typedef void (*emit_global)(Context *ctx, RegisterType regtype, int regnum);

// one emit function for relative uniform arrays in each profile.
typedef void (*emit_array)(Context *ctx, VariableList *var);

// one emit function for relative constants arrays in each profile.
typedef void (*emit_const_array)(Context *ctx,
                                 const struct ConstantsList *constslist,
                                 int base, int size);

// one emit function for uniforms in each profile.
typedef void (*emit_uniform)(Context *ctx, RegisterType regtype, int regnum,
                             const VariableList *var);

// one emit function for samplers in each profile.
typedef void (*emit_sampler)(Context *ctx, int stage, TextureType ttype,
                             int texbem);

// one emit function for attributes in each profile.
typedef void (*emit_attribute)(Context *ctx, RegisterType regtype, int regnum,
                               MOJOSHADER_usage usage, int index, int wmask,
                               int flags);

// one args function for each possible sequence of opcode arguments.
typedef int (*args_function)(Context *ctx);

// one state function for each opcode where we have state machine updates.
typedef void (*state_function)(Context *ctx);

// one function for varnames in each profile.
typedef const char *(*varname_function)(Context *c, RegisterType t, int num);

// one function for const var array in each profile.
typedef const char *(*const_array_varname_function)(Context *c, int base, int size);

typedef struct Profile
{
    const char *name;
    emit_start start_emitter;
    emit_end end_emitter;
    emit_phase phase_emitter;
    emit_global global_emitter;
    emit_array array_emitter;
    emit_const_array const_array_emitter;
    emit_uniform uniform_emitter;
    emit_sampler sampler_emitter;
    emit_attribute attribute_emitter;
    emit_finalize finalize_emitter;
    varname_function get_varname;
    const_array_varname_function get_const_array_varname;
} Profile;


// !!! FIXME: cut and paste between every damned source file follows...
// !!! FIXME: We need to make some sort of ContextBase that applies to all
// !!! FIXME:  files and move this stuff to mojoshader_common.c ...

static inline void out_of_memory(Context *ctx)
{
    ctx->isfail = ctx->out_of_memory = 1;
} // out_of_memory

static inline void *Malloc(Context *ctx, const size_t len)
{
    void *retval = ctx->malloc((int) len, ctx->malloc_data);
    if (retval == NULL)
        out_of_memory(ctx);
    return retval;
} // Malloc

static inline char *StrDup(Context *ctx, const char *str)
{
    char *retval = (char *) Malloc(ctx, strlen(str) + 1);
    if (retval != NULL)
        strcpy(retval, str);
    return retval;
} // StrDup

static inline void Free(Context *ctx, void *ptr)
{
    ctx->free(ptr, ctx->malloc_data);
} // Free

static void * MOJOSHADERCALL MallocBridge(int bytes, void *data)
{
    return Malloc((Context *) data, (size_t) bytes);
} // MallocBridge

static void MOJOSHADERCALL FreeBridge(void *ptr, void *data)
{
    Free((Context *) data, ptr);
} // FreeBridge


// jump between output sections in the context...

static int set_output(Context *ctx, Buffer **section)
{
    // only create output sections on first use.
    if (*section == NULL)
    {
        *section = buffer_create(256, MallocBridge, FreeBridge, ctx);
        if (*section == NULL)
            return 0;
    } // if

    ctx->output = *section;
    return 1;
} // set_output

static void push_output(Context *ctx, Buffer **section)
{
    assert(ctx->output_stack_len < (int) (STATICARRAYLEN(ctx->output_stack)));
    ctx->output_stack[ctx->output_stack_len] = ctx->output;
    ctx->indent_stack[ctx->output_stack_len] = ctx->indent;
    ctx->output_stack_len++;
    if (!set_output(ctx, section))
        return;
    ctx->indent = 0;
} // push_output

static inline void pop_output(Context *ctx)
{
    assert(ctx->output_stack_len > 0);
    ctx->output_stack_len--;
    ctx->output = ctx->output_stack[ctx->output_stack_len];
    ctx->indent = ctx->indent_stack[ctx->output_stack_len];
} // pop_output



// Shader model version magic...

static inline uint32 ver_ui32(const uint8 major, const uint8 minor)
{
    return ( (((uint32) major) << 16) | (((minor) == 0xFF) ? 1 : (minor)) );
} // version_ui32

static inline int shader_version_supported(const uint8 maj, const uint8 min)
{
    return (ver_ui32(maj,min) <= ver_ui32(MAX_SHADER_MAJOR, MAX_SHADER_MINOR));
} // shader_version_supported

static inline int shader_version_atleast(const Context *ctx, const uint8 maj,
                                         const uint8 min)
{
    return (ver_ui32(ctx->major_ver, ctx->minor_ver) >= ver_ui32(maj, min));
} // shader_version_atleast

static inline int shader_version_exactly(const Context *ctx, const uint8 maj,
                                         const uint8 min)
{
    return ((ctx->major_ver == maj) && (ctx->minor_ver == min));
} // shader_version_exactly

static inline int shader_is_pixel(const Context *ctx)
{
    return (ctx->shader_type == MOJOSHADER_TYPE_PIXEL);
} // shader_is_pixel

static inline int shader_is_vertex(const Context *ctx)
{
    return (ctx->shader_type == MOJOSHADER_TYPE_VERTEX);
} // shader_is_vertex


static inline int isfail(const Context *ctx)
{
    return ctx->isfail;
} // isfail


static void failf(Context *ctx, const char *fmt, ...) ISPRINTF(2,3);
static void failf(Context *ctx, const char *fmt, ...)
{
    ctx->isfail = 1;
    if (ctx->out_of_memory)
        return;

    // no filename at this level (we pass a NULL to errorlist_add_va()...)
    va_list ap;
    va_start(ap, fmt);
    errorlist_add_va(ctx->errors, NULL, ctx->current_position, fmt, ap);
    va_end(ap);
} // failf


static inline void fail(Context *ctx, const char *reason)
{
    failf(ctx, "%s", reason);
} // fail


static void output_line(Context *ctx, const char *fmt, ...) ISPRINTF(2,3);
static void output_line(Context *ctx, const char *fmt, ...)
{
    assert(ctx->output != NULL);
    if (isfail(ctx))
        return;  // we failed previously, don't go on...

    const int indent = ctx->indent;
    if (indent > 0)
    {
        char *indentbuf = (char *) alloca(indent);
        memset(indentbuf, '\t', indent);
        buffer_append(ctx->output, indentbuf, indent);
    } // if

    va_list ap;
    va_start(ap, fmt);
    buffer_append_va(ctx->output, fmt, ap);
    va_end(ap);

    buffer_append(ctx->output, ctx->endline, ctx->endline_len);
} // output_line


static inline void output_blank_line(Context *ctx)
{
    assert(ctx->output != NULL);
    if (!isfail(ctx))
        buffer_append(ctx->output, ctx->endline, ctx->endline_len);
} // output_blank_line


// !!! FIXME: this is sort of nasty.
static void floatstr(Context *ctx, char *buf, size_t bufsize, float f,
                     int leavedecimal)
{
    const size_t len = MOJOSHADER_printFloat(buf, bufsize, f);
    if ((len+2) >= bufsize)
        fail(ctx, "BUG: internal buffer is too small");
    else
    {
        char *end = buf + len;
        char *ptr = strchr(buf, '.');
        if (ptr == NULL)
        {
            if (leavedecimal)
                strcat(buf, ".0");
            return;  // done.
        } // if

        while (--end != ptr)
        {
            if (*end != '0')
            {
                end++;
                break;
            } // if
        } // while
        if ((leavedecimal) && (end == ptr))
            end += 2;
        *end = '\0';  // chop extra '0' or all decimal places off.
    } // else
} // floatstr

static inline TextureType cvtMojoToD3DSamplerType(const MOJOSHADER_samplerType type)
{
    return (TextureType) (((int) type) + 2);
} // cvtMojoToD3DSamplerType

static inline MOJOSHADER_samplerType cvtD3DToMojoSamplerType(const TextureType type)
{
    return (MOJOSHADER_samplerType) (((int) type) - 2);
} // cvtD3DToMojoSamplerType


// Deal with register lists...  !!! FIXME: I sort of hate this.

static void free_reglist(MOJOSHADER_free f, void *d, RegisterList *item)
{
    while (item != NULL)
    {
        RegisterList *next = item->next;
        f(item, d);
        item = next;
    } // while
} // free_reglist

static inline uint32 reg_to_ui32(const RegisterType regtype, const int regnum)
{
    return ( ((uint32) regtype) | (((uint32) regnum) << 16) );
} // reg_to_uint32

// !!! FIXME: ditch this for a hash table.
static RegisterList *reglist_insert(Context *ctx, RegisterList *prev,
                                    const RegisterType regtype,
                                    const int regnum)
{
    const uint32 newval = reg_to_ui32(regtype, regnum);
    RegisterList *item = prev->next;
    while (item != NULL)
    {
        const uint32 val = reg_to_ui32(item->regtype, item->regnum);
        if (newval == val)
            return item;  // already set, so we're done.
        else if (newval < val)  // insert it here.
            break;
        else // if (newval > val)
        {
            // keep going, we're not to the insertion point yet.
            prev = item;
            item = item->next;
        } // else
    } // while

    // we need to insert an entry after (prev).
    item = (RegisterList *) Malloc(ctx, sizeof (RegisterList));
    if (item != NULL)
    {
        item->regtype = regtype;
        item->regnum = regnum;
        item->usage = MOJOSHADER_USAGE_UNKNOWN;
        item->index = 0;
        item->writemask = 0;
        item->misc = 0;
        item->array = NULL;
        item->next = prev->next;
        prev->next = item;
    } // if

    return item;
} // reglist_insert

static RegisterList *reglist_find(const RegisterList *prev,
                                  const RegisterType rtype, const int regnum)
{
    const uint32 newval = reg_to_ui32(rtype, regnum);
    RegisterList *item = prev->next;
    while (item != NULL)
    {
        const uint32 val = reg_to_ui32(item->regtype, item->regnum);
        if (newval == val)
            return item;  // here it is.
        else if (newval < val)  // should have been here if it existed.
            return NULL;
        else // if (newval > val)
            item = item->next;
    } // while

    return NULL;  // wasn't in the list.
} // reglist_find

static inline const RegisterList *reglist_exists(RegisterList *prev,
                                                 const RegisterType regtype,
                                                 const int regnum)
{
    return (reglist_find(prev, regtype, regnum));
} // reglist_exists

static inline int register_was_written(Context *ctx, const RegisterType rtype,
                                       const int regnum)
{
    RegisterList *reg = reglist_find(&ctx->used_registers, rtype, regnum);
    return (reg && reg->written);
} // register_was_written

static inline RegisterList *set_used_register(Context *ctx,
                                              const RegisterType regtype,
                                              const int regnum,
                                              const int written)
{
    RegisterList *reg = NULL;
    if ((regtype == REG_TYPE_COLOROUT) && (regnum > 0))
        ctx->have_multi_color_outputs = 1;

    reg = reglist_insert(ctx, &ctx->used_registers, regtype, regnum);
    if (reg && written)
        reg->written = 1;
    return reg;
} // set_used_register

static inline int get_used_register(Context *ctx, const RegisterType regtype,
                                    const int regnum)
{
    return (reglist_exists(&ctx->used_registers, regtype, regnum) != NULL);
} // get_used_register

static inline void set_defined_register(Context *ctx, const RegisterType rtype,
                                        const int regnum)
{
    reglist_insert(ctx, &ctx->defined_registers, rtype, regnum);
} // set_defined_register

static inline int get_defined_register(Context *ctx, const RegisterType rtype,
                                       const int regnum)
{
    return (reglist_exists(&ctx->defined_registers, rtype, regnum) != NULL);
} // get_defined_register

static void add_attribute_register(Context *ctx, const RegisterType rtype,
                                const int regnum, const MOJOSHADER_usage usage,
                                const int index, const int writemask, int flags)
{
    RegisterList *item = reglist_insert(ctx, &ctx->attributes, rtype, regnum);
    item->usage = usage;
    item->index = index;
    item->writemask = writemask;
    item->misc = flags;

    if ((rtype == REG_TYPE_OUTPUT) && (usage == MOJOSHADER_USAGE_POINTSIZE))
        ctx->uses_pointsize = 1;  // note that we have to check this later.
    else if ((rtype == REG_TYPE_OUTPUT) && (usage == MOJOSHADER_USAGE_FOG))
        ctx->uses_fog = 1;  // note that we have to check this later.
} // add_attribute_register

static inline void add_sampler(Context *ctx, const int regnum,
                               TextureType ttype, const int texbem)
{
    const RegisterType rtype = REG_TYPE_SAMPLER;

    // !!! FIXME: make sure it doesn't exist?
    // !!! FIXME:  (ps_1_1 assume we can add it multiple times...)
    RegisterList *item = reglist_insert(ctx, &ctx->samplers, rtype, regnum);

    if (ctx->samplermap != NULL)
    {
        unsigned int i;
        for (i = 0; i < ctx->samplermap_count; i++)
        {
            if (ctx->samplermap[i].index == regnum)
            {
                ttype = cvtMojoToD3DSamplerType(ctx->samplermap[i].type);
                break;
            } // if
        } // for
    } // if

    item->index = (int) ttype;
    item->misc |= texbem;
} // add_sampler


static inline int writemask_xyzw(const int writemask)
{
    return (writemask == 0xF);  // 0xF == 1111. No explicit mask (full!).
} // writemask_xyzw


static inline int writemask_xyz(const int writemask)
{
    return (writemask == 0x7);  // 0x7 == 0111. (that is: xyz)
} // writemask_xyz


static inline int writemask_xy(const int writemask)
{
    return (writemask == 0x3);  // 0x3 == 0011. (that is: xy)
} // writemask_xy


static inline int writemask_x(const int writemask)
{
    return (writemask == 0x1);  // 0x1 == 0001. (that is: x)
} // writemask_x


static inline int writemask_y(const int writemask)
{
    return (writemask == 0x2);  // 0x1 == 0010. (that is: y)
} // writemask_y


static inline int replicate_swizzle(const int swizzle)
{
    return ( (((swizzle >> 0) & 0x3) == ((swizzle >> 2) & 0x3)) &&
             (((swizzle >> 2) & 0x3) == ((swizzle >> 4) & 0x3)) &&
             (((swizzle >> 4) & 0x3) == ((swizzle >> 6) & 0x3)) );
} // replicate_swizzle


static inline int no_swizzle(const int swizzle)
{
    return (swizzle == 0xE4);  // 0xE4 == 11100100 ... 0 1 2 3. No swizzle.
} // no_swizzle


static inline int vecsize_from_writemask(const int m)
{
    return (m & 1) + ((m >> 1) & 1) + ((m >> 2) & 1) + ((m >> 3) & 1);
} // vecsize_from_writemask


static inline void set_dstarg_writemask(DestArgInfo *dst, const int mask)
{
    dst->writemask = mask;
    dst->writemask0 = ((mask >> 0) & 1);
    dst->writemask1 = ((mask >> 1) & 1);
    dst->writemask2 = ((mask >> 2) & 1);
    dst->writemask3 = ((mask >> 3) & 1);
} // set_dstarg_writemask


static int allocate_scratch_register(Context *ctx)
{
    const int retval = ctx->scratch_registers++;
    if (retval >= ctx->max_scratch_registers)
        ctx->max_scratch_registers = retval + 1;
    return retval;
} // allocate_scratch_register

static int allocate_branch_label(Context *ctx)
{
    return ctx->assigned_branch_labels++;
} // allocate_branch_label

static inline void adjust_token_position(Context *ctx, const int incr)
{
    ctx->tokens += incr;
    ctx->tokencount -= incr;
    ctx->current_position += incr * sizeof (uint32);
} // adjust_token_position


// D3D stuff that's used in more than just the d3d profile...

static int isscalar(Context *ctx, const MOJOSHADER_shaderType shader_type,
                    const RegisterType rtype, const int rnum)
{
    const int uses_psize = ctx->uses_pointsize;
    const int uses_fog = ctx->uses_fog;
    if ( (rtype == REG_TYPE_OUTPUT) && ((uses_psize) || (uses_fog)) )
    {
        const RegisterList *reg = reglist_find(&ctx->attributes, rtype, rnum);
        if (reg != NULL)
        {
            const MOJOSHADER_usage usage = reg->usage;
            return ( (uses_psize && (usage == MOJOSHADER_USAGE_POINTSIZE)) ||
                     (uses_fog && (usage == MOJOSHADER_USAGE_FOG)) );
        } // if
    } // if

    return scalar_register(shader_type, rtype, rnum);
} // isscalar

static const char swizzle_channels[] = { 'x', 'y', 'z', 'w' };


static const char *usagestrs[] = {
    "_position", "_blendweight", "_blendindices", "_normal", "_psize",
    "_texcoord", "_tangent", "_binormal", "_tessfactor", "_positiont",
    "_color", "_fog", "_depth", "_sample"
};

static const char *get_D3D_register_string(Context *ctx,
                                           RegisterType regtype,
                                           int regnum, char *regnum_str,
                                           size_t regnum_size)
{
    const char *retval = NULL;
    int has_number = 1;

    switch (regtype)
    {
        case REG_TYPE_TEMP:
            retval = "r";
            break;

        case REG_TYPE_INPUT:
            retval = "v";
            break;

        case REG_TYPE_CONST:
            retval = "c";
            break;

        case REG_TYPE_ADDRESS:  // (or REG_TYPE_TEXTURE, same value.)
            retval = shader_is_vertex(ctx) ? "a" : "t";
            break;

        case REG_TYPE_RASTOUT:
            switch ((RastOutType) regnum)
            {
                case RASTOUT_TYPE_POSITION: retval = "oPos"; break;
                case RASTOUT_TYPE_FOG: retval = "oFog"; break;
                case RASTOUT_TYPE_POINT_SIZE: retval = "oPts"; break;
            } // switch
            has_number = 0;
            break;

        case REG_TYPE_ATTROUT:
            retval = "oD";
            break;

        case REG_TYPE_OUTPUT: // (or REG_TYPE_TEXCRDOUT, same value.)
            if (shader_is_vertex(ctx) && shader_version_atleast(ctx, 3, 0))
                retval = "o";
            else
                retval = "oT";
            break;

        case REG_TYPE_CONSTINT:
            retval = "i";
            break;

        case REG_TYPE_COLOROUT:
            retval = "oC";
            break;

        case REG_TYPE_DEPTHOUT:
            retval = "oDepth";
            has_number = 0;
            break;

        case REG_TYPE_SAMPLER:
            retval = "s";
            break;

        case REG_TYPE_CONSTBOOL:
            retval = "b";
            break;

        case REG_TYPE_LOOP:
            retval = "aL";
            has_number = 0;
            break;

        case REG_TYPE_MISCTYPE:
            switch ((const MiscTypeType) regnum)
            {
                case MISCTYPE_TYPE_POSITION: retval = "vPos"; break;
                case MISCTYPE_TYPE_FACE: retval = "vFace"; break;
            } // switch
            has_number = 0;
            break;

        case REG_TYPE_LABEL:
            retval = "l";
            break;

        case REG_TYPE_PREDICATE:
            retval = "p";
            break;

        //case REG_TYPE_TEMPFLOAT16:  // !!! FIXME: don't know this asm string
        default:
            fail(ctx, "unknown register type");
            retval = "???";
            has_number = 0;
            break;
    } // switch

    if (has_number)
        snprintf(regnum_str, regnum_size, "%u", (uint) regnum);
    else
        regnum_str[0] = '\0';

    return retval;
} // get_D3D_register_string


// !!! FIXME: can we split the profile code out to separate source files?

#define AT_LEAST_ONE_PROFILE 0

#if !SUPPORT_PROFILE_D3D
#define PROFILE_EMITTER_D3D(op)
#else
#undef AT_LEAST_ONE_PROFILE
#define AT_LEAST_ONE_PROFILE 1
#define PROFILE_EMITTER_D3D(op) emit_D3D_##op,

static const char *make_D3D_srcarg_string_in_buf(Context *ctx,
                                                 const SourceArgInfo *arg,
                                                 char *buf, size_t buflen)
{
    const char *premod_str = "";
    const char *postmod_str = "";
    switch (arg->src_mod)
    {
        case SRCMOD_NEGATE:
            premod_str = "-";
            break;

        case SRCMOD_BIASNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_BIAS:
            postmod_str = "_bias";
            break;

        case SRCMOD_SIGNNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_SIGN:
            postmod_str = "_bx2";
            break;

        case SRCMOD_COMPLEMENT:
            premod_str = "1-";
            break;

        case SRCMOD_X2NEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_X2:
            postmod_str = "_x2";
            break;

        case SRCMOD_DZ:
            postmod_str = "_dz";
            break;

        case SRCMOD_DW:
            postmod_str = "_dw";
            break;

        case SRCMOD_ABSNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_ABS:
            postmod_str = "_abs";
            break;

        case SRCMOD_NOT:
            premod_str = "!";
            break;

        case SRCMOD_NONE:
        case SRCMOD_TOTAL:
             break;  // stop compiler whining.
    } // switch


    char regnum_str[16];
    const char *regtype_str = get_D3D_register_string(ctx, arg->regtype,
                                                      arg->regnum, regnum_str,
                                                      sizeof (regnum_str));

    if (regtype_str == NULL)
    {
        fail(ctx, "Unknown source register type.");
        *buf = '\0';
        return buf;
    } // if

    const char *rel_lbracket = "";
    const char *rel_rbracket = "";
    char rel_swizzle[4] = { '\0' };
    char rel_regnum_str[16] = { '\0' };
    const char *rel_regtype_str = "";
    if (arg->relative)
    {
        rel_swizzle[0] = '.';
        rel_swizzle[1] = swizzle_channels[arg->relative_component];
        rel_swizzle[2] = '\0';
        rel_lbracket = "[";
        rel_rbracket = "]";
        rel_regtype_str = get_D3D_register_string(ctx, arg->relative_regtype,
                                                  arg->relative_regnum,
                                                  rel_regnum_str,
                                                  sizeof (rel_regnum_str));

        if (regtype_str == NULL)
        {
            fail(ctx, "Unknown relative source register type.");
            *buf = '\0';
            return buf;
        } // if
    } // if

    char swizzle_str[6];
    size_t i = 0;
    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !no_swizzle(arg->swizzle))
    {
        swizzle_str[i++] = '.';
        swizzle_str[i++] = swizzle_channels[arg->swizzle_x];
        swizzle_str[i++] = swizzle_channels[arg->swizzle_y];
        swizzle_str[i++] = swizzle_channels[arg->swizzle_z];
        swizzle_str[i++] = swizzle_channels[arg->swizzle_w];

        // .xyzz is the same as .xyz, .z is the same as .zzzz, etc.
        while (swizzle_str[i-1] == swizzle_str[i-2])
            i--;
    } // if
    swizzle_str[i] = '\0';
    assert(i < sizeof (swizzle_str));

    // !!! FIXME: c12[a0.x] actually needs to be c[a0.x + 12]
    snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s%s",
             premod_str, regtype_str, regnum_str, postmod_str,
             rel_lbracket, rel_regtype_str, rel_regnum_str, rel_swizzle,
             rel_rbracket, swizzle_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_D3D_srcarg_string_in_buf


static const char *make_D3D_destarg_string(Context *ctx, char *buf,
                                           const size_t buflen)
{
    const DestArgInfo *arg = &ctx->dest_arg;

    const char *result_shift_str = "";
    switch (arg->result_shift)
    {
        case 0x1: result_shift_str = "_x2"; break;
        case 0x2: result_shift_str = "_x4"; break;
        case 0x3: result_shift_str = "_x8"; break;
        case 0xD: result_shift_str = "_d8"; break;
        case 0xE: result_shift_str = "_d4"; break;
        case 0xF: result_shift_str = "_d2"; break;
    } // switch

    const char *sat_str = (arg->result_mod & MOD_SATURATE) ? "_sat" : "";
    const char *pp_str = (arg->result_mod & MOD_PP) ? "_pp" : "";
    const char *cent_str = (arg->result_mod & MOD_CENTROID) ? "_centroid" : "";

    char regnum_str[16];
    const char *regtype_str = get_D3D_register_string(ctx, arg->regtype,
                                                      arg->regnum, regnum_str,
                                                      sizeof (regnum_str));
    if (regtype_str == NULL)
    {
        fail(ctx, "Unknown destination register type.");
        *buf = '\0';
        return buf;
    } // if

    char writemask_str[6];
    size_t i = 0;
    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !writemask_xyzw(arg->writemask))
    {
        writemask_str[i++] = '.';
        if (arg->writemask0) writemask_str[i++] = 'x';
        if (arg->writemask1) writemask_str[i++] = 'y';
        if (arg->writemask2) writemask_str[i++] = 'z';
        if (arg->writemask3) writemask_str[i++] = 'w';
    } // if
    writemask_str[i] = '\0';
    assert(i < sizeof (writemask_str));

    const char *pred_left = "";
    const char *pred_right = "";
    char pred[32] = { '\0' };
    if (ctx->predicated)
    {
        pred_left = "(";
        pred_right = ") ";
        make_D3D_srcarg_string_in_buf(ctx, &ctx->predicate_arg,
                                      pred, sizeof (pred));
    } // if

    // may turn out something like "_x2_sat_pp_centroid (!p0.x) r0.xyzw" ...
    snprintf(buf, buflen, "%s%s%s%s %s%s%s%s%s%s",
             result_shift_str, sat_str, pp_str, cent_str,
             pred_left, pred, pred_right,
             regtype_str, regnum_str, writemask_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_D3D_destarg_string


static const char *make_D3D_srcarg_string(Context *ctx, const size_t idx,
                                          char *buf, size_t buflen)
{
    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        *buf = '\0';
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];
    return make_D3D_srcarg_string_in_buf(ctx, arg, buf, buflen);
} // make_D3D_srcarg_string

static const char *get_D3D_varname_in_buf(Context *ctx, RegisterType rt,
                                           int regnum, char *buf,
                                           const size_t len)
{
    char regnum_str[16];
    const char *regtype_str = get_D3D_register_string(ctx, rt, regnum,
                                              regnum_str, sizeof (regnum_str));
    snprintf(buf,len,"%s%s", regtype_str, regnum_str);
    return buf;
} // get_D3D_varname_in_buf


static const char *get_D3D_varname(Context *ctx, RegisterType rt, int regnum)
{
    char buf[64];
    get_D3D_varname_in_buf(ctx, rt, regnum, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_D3D_varname


static const char *get_D3D_const_array_varname(Context *ctx, int base, int size)
{
    char buf[64];
    snprintf(buf, sizeof (buf), "c_array_%d_%d", base, size);
    return StrDup(ctx, buf);
} // get_D3D_const_array_varname


static void emit_D3D_start(Context *ctx, const char *profilestr)
{
    const uint major = (uint) ctx->major_ver;
    const uint minor = (uint) ctx->minor_ver;
    char minor_str[16];

    ctx->ignores_ctab = 1;

    if (minor == 0xFF)
        strcpy(minor_str, "sw");
    else if ((major > 1) && (minor == 1))
        strcpy(minor_str, "x");  // for >= SM2, apparently this is "x". Weird.
    else
        snprintf(minor_str, sizeof (minor_str), "%u", (uint) minor);

    output_line(ctx, "%s_%u_%s", ctx->shader_type_str, major, minor_str);
} // emit_D3D_start


static void emit_D3D_end(Context *ctx)
{
    output_line(ctx, "end");
} // emit_D3D_end


static void emit_D3D_phase(Context *ctx)
{
    output_line(ctx, "phase");
} // emit_D3D_phase


static void emit_D3D_finalize(Context *ctx)
{
    // no-op.
} // emit_D3D_finalize


static void emit_D3D_global(Context *ctx, RegisterType regtype, int regnum)
{
    // no-op.
} // emit_D3D_global


static void emit_D3D_array(Context *ctx, VariableList *var)
{
    // no-op.
} // emit_D3D_array


static void emit_D3D_const_array(Context *ctx, const ConstantsList *clist,
                                 int base, int size)
{
    // no-op.
} // emit_D3D_const_array


static void emit_D3D_uniform(Context *ctx, RegisterType regtype, int regnum,
                             const VariableList *var)
{
    // no-op.
} // emit_D3D_uniform


static void emit_D3D_sampler(Context *ctx, int s, TextureType ttype, int tb)
{
    // no-op.
} // emit_D3D_sampler


static void emit_D3D_attribute(Context *ctx, RegisterType regtype, int regnum,
                               MOJOSHADER_usage usage, int index, int wmask,
                               int flags)
{
    // no-op.
} // emit_D3D_attribute


static void emit_D3D_RESERVED(Context *ctx)
{
    // do nothing; fails in the state machine.
} // emit_D3D_RESERVED


// Generic D3D opcode emitters. A list of macros generate all the entry points
//  that call into these...

static char *lowercase(char *dst, const char *src)
{
    int i = 0;
    do
    {
        const char ch = src[i];
        dst[i] = (((ch >= 'A') && (ch <= 'Z')) ? (ch - ('A' - 'a')) : ch);
    } while (src[i++]);
    return dst;
} // lowercase


static void emit_D3D_opcode_d(Context *ctx, const char *opcode)
{
    char dst[64]; make_D3D_destarg_string(ctx, dst, sizeof (dst));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s%s", ctx->coissue ? "+" : "", opcode, dst);
} // emit_D3D_opcode_d


static void emit_D3D_opcode_s(Context *ctx, const char *opcode)
{
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s %s", ctx->coissue ? "+" : "", opcode, src0);
} // emit_D3D_opcode_s


static void emit_D3D_opcode_ss(Context *ctx, const char *opcode)
{
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_D3D_srcarg_string(ctx, 1, src1, sizeof (src1));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s %s, %s", ctx->coissue ? "+" : "", opcode, src0, src1);
} // emit_D3D_opcode_ss


static void emit_D3D_opcode_ds(Context *ctx, const char *opcode)
{
    char dst[64]; make_D3D_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s%s, %s", ctx->coissue ? "+" : "", opcode, dst, src0);
} // emit_D3D_opcode_ds


static void emit_D3D_opcode_dss(Context *ctx, const char *opcode)
{
    char dst[64]; make_D3D_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_D3D_srcarg_string(ctx, 1, src1, sizeof (src1));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s%s, %s, %s", ctx->coissue ? "+" : "",
                opcode, dst, src0, src1);
} // emit_D3D_opcode_dss


static void emit_D3D_opcode_dsss(Context *ctx, const char *opcode)
{
    char dst[64]; make_D3D_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_D3D_srcarg_string(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_D3D_srcarg_string(ctx, 2, src2, sizeof (src2));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s%s, %s, %s, %s", ctx->coissue ? "+" : "", 
                opcode, dst, src0, src1, src2);
} // emit_D3D_opcode_dsss


static void emit_D3D_opcode_dssss(Context *ctx, const char *opcode)
{
    char dst[64]; make_D3D_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_D3D_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_D3D_srcarg_string(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_D3D_srcarg_string(ctx, 2, src2, sizeof (src2));
    char src3[64]; make_D3D_srcarg_string(ctx, 3, src3, sizeof (src3));
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx,"%s%s%s, %s, %s, %s, %s", ctx->coissue ? "+" : "",
                opcode, dst, src0, src1, src2, src3);
} // emit_D3D_opcode_dssss


static void emit_D3D_opcode(Context *ctx, const char *opcode)
{
    opcode = lowercase((char *) alloca(strlen(opcode) + 1), opcode);
    output_line(ctx, "%s%s", ctx->coissue ? "+" : "", opcode);
} // emit_D3D_opcode


#define EMIT_D3D_OPCODE_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_D_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_d(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_S_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_s(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_SS_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_ss(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_DS_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_ds(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_DSS_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_dss(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_DSSS_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_dsss(ctx, #op); \
    }
#define EMIT_D3D_OPCODE_DSSSS_FUNC(op) \
    static void emit_D3D_##op(Context *ctx) { \
        emit_D3D_opcode_dssss(ctx, #op); \
    }

EMIT_D3D_OPCODE_FUNC(NOP)
EMIT_D3D_OPCODE_DS_FUNC(MOV)
EMIT_D3D_OPCODE_DSS_FUNC(ADD)
EMIT_D3D_OPCODE_DSS_FUNC(SUB)
EMIT_D3D_OPCODE_DSSS_FUNC(MAD)
EMIT_D3D_OPCODE_DSS_FUNC(MUL)
EMIT_D3D_OPCODE_DS_FUNC(RCP)
EMIT_D3D_OPCODE_DS_FUNC(RSQ)
EMIT_D3D_OPCODE_DSS_FUNC(DP3)
EMIT_D3D_OPCODE_DSS_FUNC(DP4)
EMIT_D3D_OPCODE_DSS_FUNC(MIN)
EMIT_D3D_OPCODE_DSS_FUNC(MAX)
EMIT_D3D_OPCODE_DSS_FUNC(SLT)
EMIT_D3D_OPCODE_DSS_FUNC(SGE)
EMIT_D3D_OPCODE_DS_FUNC(EXP)
EMIT_D3D_OPCODE_DS_FUNC(LOG)
EMIT_D3D_OPCODE_DS_FUNC(LIT)
EMIT_D3D_OPCODE_DSS_FUNC(DST)
EMIT_D3D_OPCODE_DSSS_FUNC(LRP)
EMIT_D3D_OPCODE_DS_FUNC(FRC)
EMIT_D3D_OPCODE_DSS_FUNC(M4X4)
EMIT_D3D_OPCODE_DSS_FUNC(M4X3)
EMIT_D3D_OPCODE_DSS_FUNC(M3X4)
EMIT_D3D_OPCODE_DSS_FUNC(M3X3)
EMIT_D3D_OPCODE_DSS_FUNC(M3X2)
EMIT_D3D_OPCODE_S_FUNC(CALL)
EMIT_D3D_OPCODE_SS_FUNC(CALLNZ)
EMIT_D3D_OPCODE_SS_FUNC(LOOP)
EMIT_D3D_OPCODE_FUNC(RET)
EMIT_D3D_OPCODE_FUNC(ENDLOOP)
EMIT_D3D_OPCODE_S_FUNC(LABEL)
EMIT_D3D_OPCODE_DSS_FUNC(POW)
EMIT_D3D_OPCODE_DSS_FUNC(CRS)
EMIT_D3D_OPCODE_DSSS_FUNC(SGN)
EMIT_D3D_OPCODE_DS_FUNC(ABS)
EMIT_D3D_OPCODE_DS_FUNC(NRM)
EMIT_D3D_OPCODE_S_FUNC(REP)
EMIT_D3D_OPCODE_FUNC(ENDREP)
EMIT_D3D_OPCODE_S_FUNC(IF)
EMIT_D3D_OPCODE_FUNC(ELSE)
EMIT_D3D_OPCODE_FUNC(ENDIF)
EMIT_D3D_OPCODE_FUNC(BREAK)
EMIT_D3D_OPCODE_DS_FUNC(MOVA)
EMIT_D3D_OPCODE_D_FUNC(TEXKILL)
EMIT_D3D_OPCODE_DS_FUNC(TEXBEM)
EMIT_D3D_OPCODE_DS_FUNC(TEXBEML)
EMIT_D3D_OPCODE_DS_FUNC(TEXREG2AR)
EMIT_D3D_OPCODE_DS_FUNC(TEXREG2GB)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X2PAD)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X2TEX)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X3PAD)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X3TEX)
EMIT_D3D_OPCODE_DSS_FUNC(TEXM3X3SPEC)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X3VSPEC)
EMIT_D3D_OPCODE_DS_FUNC(EXPP)
EMIT_D3D_OPCODE_DS_FUNC(LOGP)
EMIT_D3D_OPCODE_DSSS_FUNC(CND)
EMIT_D3D_OPCODE_DS_FUNC(TEXREG2RGB)
EMIT_D3D_OPCODE_DS_FUNC(TEXDP3TEX)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X2DEPTH)
EMIT_D3D_OPCODE_DS_FUNC(TEXDP3)
EMIT_D3D_OPCODE_DS_FUNC(TEXM3X3)
EMIT_D3D_OPCODE_D_FUNC(TEXDEPTH)
EMIT_D3D_OPCODE_DSSS_FUNC(CMP)
EMIT_D3D_OPCODE_DSS_FUNC(BEM)
EMIT_D3D_OPCODE_DSSS_FUNC(DP2ADD)
EMIT_D3D_OPCODE_DS_FUNC(DSX)
EMIT_D3D_OPCODE_DS_FUNC(DSY)
EMIT_D3D_OPCODE_DSSSS_FUNC(TEXLDD)
EMIT_D3D_OPCODE_DSS_FUNC(TEXLDL)
EMIT_D3D_OPCODE_S_FUNC(BREAKP)

// special cases for comparison opcodes...
static const char *get_D3D_comparison_string(Context *ctx)
{
    static const char *comps[] = {
        "", "_gt", "_eq", "_ge", "_lt", "_ne", "_le"
    };

    if (ctx->instruction_controls >= STATICARRAYLEN(comps))
    {
        fail(ctx, "unknown comparison control");
        return "";
    } // if

    return comps[ctx->instruction_controls];
} // get_D3D_comparison_string

static void emit_D3D_BREAKC(Context *ctx)
{
    char op[16];
    snprintf(op, sizeof (op), "break%s", get_D3D_comparison_string(ctx));
    emit_D3D_opcode_ss(ctx, op);
} // emit_D3D_BREAKC

static void emit_D3D_IFC(Context *ctx)
{
    char op[16];
    snprintf(op, sizeof (op), "if%s", get_D3D_comparison_string(ctx));
    emit_D3D_opcode_ss(ctx, op);
} // emit_D3D_IFC

static void emit_D3D_SETP(Context *ctx)
{
    char op[16];
    snprintf(op, sizeof (op), "setp%s", get_D3D_comparison_string(ctx));
    emit_D3D_opcode_dss(ctx, op);
} // emit_D3D_SETP

static void emit_D3D_DEF(Context *ctx)
{
    char dst[64];
    make_D3D_destarg_string(ctx, dst, sizeof (dst));
    const float *val = (const float *) ctx->dwords; // !!! FIXME: could be int?
    char val0[32];
    char val1[32];
    char val2[32];
    char val3[32];
    floatstr(ctx, val0, sizeof (val0), val[0], 0);
    floatstr(ctx, val1, sizeof (val1), val[1], 0);
    floatstr(ctx, val2, sizeof (val2), val[2], 0);
    floatstr(ctx, val3, sizeof (val3), val[3], 0);
    output_line(ctx, "def%s, %s, %s, %s, %s", dst, val0, val1, val2, val3);
} // emit_D3D_DEF

static void emit_D3D_DEFI(Context *ctx)
{
    char dst[64];
    make_D3D_destarg_string(ctx, dst, sizeof (dst));
    const int32 *x = (const int32 *) ctx->dwords;
    output_line(ctx, "defi%s, %d, %d, %d, %d", dst,
                (int) x[0], (int) x[1], (int) x[2], (int) x[3]);
} // emit_D3D_DEFI

static void emit_D3D_DEFB(Context *ctx)
{
    char dst[64];
    make_D3D_destarg_string(ctx, dst, sizeof (dst));
    output_line(ctx, "defb%s, %s", dst, ctx->dwords[0] ? "true" : "false");
} // emit_D3D_DEFB


static void emit_D3D_DCL(Context *ctx)
{
    char dst[64];
    make_D3D_destarg_string(ctx, dst, sizeof (dst));
    const DestArgInfo *arg = &ctx->dest_arg;
    const char *usage_str = "";
    char index_str[16] = { '\0' };

    if (arg->regtype == REG_TYPE_SAMPLER)
    {
        switch ((const TextureType) ctx->dwords[0])
        {
            case TEXTURE_TYPE_2D: usage_str = "_2d"; break;
            case TEXTURE_TYPE_CUBE: usage_str = "_cube"; break;
            case TEXTURE_TYPE_VOLUME: usage_str = "_volume"; break;
            default: fail(ctx, "unknown sampler texture type"); return;
        } // switch
    } // if

    else if (arg->regtype == REG_TYPE_MISCTYPE)
    {
        switch ((const MiscTypeType) arg->regnum)
        {
            case MISCTYPE_TYPE_POSITION:
            case MISCTYPE_TYPE_FACE:
                usage_str = "";  // just become "dcl vFace" or whatever.
                break;
            default: fail(ctx, "unknown misc register type"); return;
        } // switch
    } // else if

    else
    {
        const uint32 usage = ctx->dwords[0];
        const uint32 index = ctx->dwords[1];
        usage_str = usagestrs[usage];
        if (index != 0)
            snprintf(index_str, sizeof (index_str), "%u", (uint) index);
    } // else

    output_line(ctx, "dcl%s%s%s", usage_str, index_str, dst);
} // emit_D3D_DCL


static void emit_D3D_TEXCRD(Context *ctx)
{
    // this opcode looks and acts differently depending on the shader model.
    if (shader_version_atleast(ctx, 1, 4))
        emit_D3D_opcode_ds(ctx, "texcrd");
    else
        emit_D3D_opcode_d(ctx, "texcoord");
} // emit_D3D_TEXCOORD

static void emit_D3D_TEXLD(Context *ctx)
{
    // this opcode looks and acts differently depending on the shader model.
    if (shader_version_atleast(ctx, 2, 0))
    {
        if (ctx->instruction_controls == CONTROL_TEXLD)
           emit_D3D_opcode_dss(ctx, "texld");
        else if (ctx->instruction_controls == CONTROL_TEXLDP)
           emit_D3D_opcode_dss(ctx, "texldp");
        else if (ctx->instruction_controls == CONTROL_TEXLDB)
           emit_D3D_opcode_dss(ctx, "texldb");
    } // if

    else if (shader_version_atleast(ctx, 1, 4))
    {
        emit_D3D_opcode_ds(ctx, "texld");
    } // else if

    else
    {
        emit_D3D_opcode_d(ctx, "tex");
    } // else
} // emit_D3D_TEXLD

static void emit_D3D_SINCOS(Context *ctx)
{
    // this opcode needs extra registers for sm2 and lower.
    if (!shader_version_atleast(ctx, 3, 0))
        emit_D3D_opcode_dsss(ctx, "sincos");
    else
        emit_D3D_opcode_ds(ctx, "sincos");
} // emit_D3D_SINCOS


#undef EMIT_D3D_OPCODE_FUNC
#undef EMIT_D3D_OPCODE_D_FUNC
#undef EMIT_D3D_OPCODE_S_FUNC
#undef EMIT_D3D_OPCODE_SS_FUNC
#undef EMIT_D3D_OPCODE_DS_FUNC
#undef EMIT_D3D_OPCODE_DSS_FUNC
#undef EMIT_D3D_OPCODE_DSSS_FUNC
#undef EMIT_D3D_OPCODE_DSSSS_FUNC

#endif  // SUPPORT_PROFILE_D3D


#if !SUPPORT_PROFILE_BYTECODE
#define PROFILE_EMITTER_BYTECODE(op)
#else
#undef AT_LEAST_ONE_PROFILE
#define AT_LEAST_ONE_PROFILE 1
#define PROFILE_EMITTER_BYTECODE(op) emit_BYTECODE_##op,

static void emit_BYTECODE_start(Context *ctx, const char *profilestr)
{
    ctx->ignores_ctab = 1;
} // emit_BYTECODE_start

static void emit_BYTECODE_finalize(Context *ctx)
{
    // just copy the whole token stream and make all other emitters no-ops.
    if (set_output(ctx, &ctx->mainline))
    {
        const size_t len = ((size_t) (ctx->tokens - ctx->orig_tokens)) * sizeof (uint32);
        buffer_append(ctx->mainline, (const char *) ctx->orig_tokens, len);
    } // if
} // emit_BYTECODE_finalize

static void emit_BYTECODE_end(Context *ctx) {}
static void emit_BYTECODE_phase(Context *ctx) {}
static void emit_BYTECODE_global(Context *ctx, RegisterType t, int n) {}
static void emit_BYTECODE_array(Context *ctx, VariableList *var) {}
static void emit_BYTECODE_sampler(Context *c, int s, TextureType t, int tb) {}
static void emit_BYTECODE_const_array(Context *ctx, const ConstantsList *c,
                                         int base, int size) {}
static void emit_BYTECODE_uniform(Context *ctx, RegisterType t, int n,
                                  const VariableList *var) {}
static void emit_BYTECODE_attribute(Context *ctx, RegisterType t, int n,
                                       MOJOSHADER_usage u, int i, int w,
                                       int f) {}

static const char *get_BYTECODE_varname(Context *ctx, RegisterType rt, int regnum)
{
    char regnum_str[16];
    const char *regtype_str = get_D3D_register_string(ctx, rt, regnum,
                                              regnum_str, sizeof (regnum_str));
    char buf[64];
    snprintf(buf, sizeof (buf), "%s%s", regtype_str, regnum_str);
    return StrDup(ctx, buf);
} // get_BYTECODE_varname

static const char *get_BYTECODE_const_array_varname(Context *ctx, int base, int size)
{
    char buf[64];
    snprintf(buf, sizeof (buf), "c_array_%d_%d", base, size);
    return StrDup(ctx, buf);
} // get_BYTECODE_const_array_varname

#define EMIT_BYTECODE_OPCODE_FUNC(op) \
    static void emit_BYTECODE_##op(Context *ctx) {}

EMIT_BYTECODE_OPCODE_FUNC(RESERVED)
EMIT_BYTECODE_OPCODE_FUNC(NOP)
EMIT_BYTECODE_OPCODE_FUNC(MOV)
EMIT_BYTECODE_OPCODE_FUNC(ADD)
EMIT_BYTECODE_OPCODE_FUNC(SUB)
EMIT_BYTECODE_OPCODE_FUNC(MAD)
EMIT_BYTECODE_OPCODE_FUNC(MUL)
EMIT_BYTECODE_OPCODE_FUNC(RCP)
EMIT_BYTECODE_OPCODE_FUNC(RSQ)
EMIT_BYTECODE_OPCODE_FUNC(DP3)
EMIT_BYTECODE_OPCODE_FUNC(DP4)
EMIT_BYTECODE_OPCODE_FUNC(MIN)
EMIT_BYTECODE_OPCODE_FUNC(MAX)
EMIT_BYTECODE_OPCODE_FUNC(SLT)
EMIT_BYTECODE_OPCODE_FUNC(SGE)
EMIT_BYTECODE_OPCODE_FUNC(EXP)
EMIT_BYTECODE_OPCODE_FUNC(LOG)
EMIT_BYTECODE_OPCODE_FUNC(LIT)
EMIT_BYTECODE_OPCODE_FUNC(DST)
EMIT_BYTECODE_OPCODE_FUNC(LRP)
EMIT_BYTECODE_OPCODE_FUNC(FRC)
EMIT_BYTECODE_OPCODE_FUNC(M4X4)
EMIT_BYTECODE_OPCODE_FUNC(M4X3)
EMIT_BYTECODE_OPCODE_FUNC(M3X4)
EMIT_BYTECODE_OPCODE_FUNC(M3X3)
EMIT_BYTECODE_OPCODE_FUNC(M3X2)
EMIT_BYTECODE_OPCODE_FUNC(CALL)
EMIT_BYTECODE_OPCODE_FUNC(CALLNZ)
EMIT_BYTECODE_OPCODE_FUNC(LOOP)
EMIT_BYTECODE_OPCODE_FUNC(RET)
EMIT_BYTECODE_OPCODE_FUNC(ENDLOOP)
EMIT_BYTECODE_OPCODE_FUNC(LABEL)
EMIT_BYTECODE_OPCODE_FUNC(POW)
EMIT_BYTECODE_OPCODE_FUNC(CRS)
EMIT_BYTECODE_OPCODE_FUNC(SGN)
EMIT_BYTECODE_OPCODE_FUNC(ABS)
EMIT_BYTECODE_OPCODE_FUNC(NRM)
EMIT_BYTECODE_OPCODE_FUNC(SINCOS)
EMIT_BYTECODE_OPCODE_FUNC(REP)
EMIT_BYTECODE_OPCODE_FUNC(ENDREP)
EMIT_BYTECODE_OPCODE_FUNC(IF)
EMIT_BYTECODE_OPCODE_FUNC(ELSE)
EMIT_BYTECODE_OPCODE_FUNC(ENDIF)
EMIT_BYTECODE_OPCODE_FUNC(BREAK)
EMIT_BYTECODE_OPCODE_FUNC(MOVA)
EMIT_BYTECODE_OPCODE_FUNC(TEXKILL)
EMIT_BYTECODE_OPCODE_FUNC(TEXBEM)
EMIT_BYTECODE_OPCODE_FUNC(TEXBEML)
EMIT_BYTECODE_OPCODE_FUNC(TEXREG2AR)
EMIT_BYTECODE_OPCODE_FUNC(TEXREG2GB)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X2PAD)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X2TEX)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X3PAD)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X3TEX)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X3SPEC)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X3VSPEC)
EMIT_BYTECODE_OPCODE_FUNC(EXPP)
EMIT_BYTECODE_OPCODE_FUNC(LOGP)
EMIT_BYTECODE_OPCODE_FUNC(CND)
EMIT_BYTECODE_OPCODE_FUNC(TEXREG2RGB)
EMIT_BYTECODE_OPCODE_FUNC(TEXDP3TEX)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X2DEPTH)
EMIT_BYTECODE_OPCODE_FUNC(TEXDP3)
EMIT_BYTECODE_OPCODE_FUNC(TEXM3X3)
EMIT_BYTECODE_OPCODE_FUNC(TEXDEPTH)
EMIT_BYTECODE_OPCODE_FUNC(CMP)
EMIT_BYTECODE_OPCODE_FUNC(BEM)
EMIT_BYTECODE_OPCODE_FUNC(DP2ADD)
EMIT_BYTECODE_OPCODE_FUNC(DSX)
EMIT_BYTECODE_OPCODE_FUNC(DSY)
EMIT_BYTECODE_OPCODE_FUNC(TEXLDD)
EMIT_BYTECODE_OPCODE_FUNC(TEXLDL)
EMIT_BYTECODE_OPCODE_FUNC(BREAKP)
EMIT_BYTECODE_OPCODE_FUNC(BREAKC)
EMIT_BYTECODE_OPCODE_FUNC(IFC)
EMIT_BYTECODE_OPCODE_FUNC(SETP)
EMIT_BYTECODE_OPCODE_FUNC(DEF)
EMIT_BYTECODE_OPCODE_FUNC(DEFI)
EMIT_BYTECODE_OPCODE_FUNC(DEFB)
EMIT_BYTECODE_OPCODE_FUNC(DCL)
EMIT_BYTECODE_OPCODE_FUNC(TEXCRD)
EMIT_BYTECODE_OPCODE_FUNC(TEXLD)

#undef EMIT_BYTECODE_OPCODE_FUNC

#endif  // SUPPORT_PROFILE_BYTECODE


#if !SUPPORT_PROFILE_GLSL
#define PROFILE_EMITTER_GLSL(op)
#else
#undef AT_LEAST_ONE_PROFILE
#define AT_LEAST_ONE_PROFILE 1
#define PROFILE_EMITTER_GLSL(op) emit_GLSL_##op,

#define EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(op) \
    static void emit_GLSL_##op(Context *ctx) { \
        fail(ctx, #op " unimplemented in glsl profile"); \
    }

static inline const char *get_GLSL_register_string(Context *ctx,
                        const RegisterType regtype, const int regnum,
                        char *regnum_str, const size_t regnum_size)
{
    // turns out these are identical at the moment.
    return get_D3D_register_string(ctx,regtype,regnum,regnum_str,regnum_size);
} // get_GLSL_register_string

static const char *get_GLSL_uniform_type(Context *ctx, const RegisterType rtype)
{
    switch (rtype)
    {
        case REG_TYPE_CONST: return "vec4";
        case REG_TYPE_CONSTINT: return "ivec4";
        case REG_TYPE_CONSTBOOL: return "bool";
        default: fail(ctx, "BUG: used a uniform we don't know how to define.");
    } // switch

    return NULL;
} // get_GLSL_uniform_type

static const char *get_GLSL_varname_in_buf(Context *ctx, RegisterType rt,
                                           int regnum, char *buf,
                                           const size_t len)
{
    char regnum_str[16];
    const char *regtype_str = get_GLSL_register_string(ctx, rt, regnum,
                                              regnum_str, sizeof (regnum_str));
    snprintf(buf,len,"%s_%s%s", ctx->shader_type_str, regtype_str, regnum_str);
    return buf;
} // get_GLSL_varname_in_buf


static const char *get_GLSL_varname(Context *ctx, RegisterType rt, int regnum)
{
    char buf[64];
    get_GLSL_varname_in_buf(ctx, rt, regnum, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_GLSL_varname


static inline const char *get_GLSL_const_array_varname_in_buf(Context *ctx,
                                                const int base, const int size,
                                                char *buf, const size_t buflen)
{
    const char *type = ctx->shader_type_str;
    snprintf(buf, buflen, "%s_const_array_%d_%d", type, base, size);
    return buf;
} // get_GLSL_const_array_varname_in_buf

static const char *get_GLSL_const_array_varname(Context *ctx, int base, int size)
{
    char buf[64];
    get_GLSL_const_array_varname_in_buf(ctx, base, size, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_GLSL_const_array_varname


static inline const char *get_GLSL_input_array_varname(Context *ctx,
                                                char *buf, const size_t buflen)
{
    snprintf(buf, buflen, "%s", "vertex_input_array");
    return buf;
} // get_GLSL_input_array_varname


static const char *get_GLSL_uniform_array_varname(Context *ctx,
                                                  const RegisterType regtype,
                                                  char *buf, const size_t len)
{
    const char *shadertype = ctx->shader_type_str;
    const char *type = get_GLSL_uniform_type(ctx, regtype);
    snprintf(buf, len, "%s_uniforms_%s", shadertype, type);
    return buf;
} // get_GLSL_uniform_array_varname

static const char *get_GLSL_destarg_varname(Context *ctx, char *buf, size_t len)
{
    const DestArgInfo *arg = &ctx->dest_arg;
    return get_GLSL_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, len);
} // get_GLSL_destarg_varname

static const char *get_GLSL_srcarg_varname(Context *ctx, const size_t idx,
                                           char *buf, size_t len)
{
    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        *buf = '\0';
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];
    return get_GLSL_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, len);
} // get_GLSL_srcarg_varname


static const char *make_GLSL_destarg_assign(Context *, char *, const size_t,
                                            const char *, ...) ISPRINTF(4,5);

static const char *make_GLSL_destarg_assign(Context *ctx, char *buf,
                                            const size_t buflen,
                                            const char *fmt, ...)
{
    int need_parens = 0;
    const DestArgInfo *arg = &ctx->dest_arg;

    if (arg->writemask == 0)
    {
        *buf = '\0';
        return buf;  // no writemask? It's a no-op.
    } // if

    char clampbuf[32] = { '\0' };
    const char *clampleft = "";
    const char *clampright = "";
    if (arg->result_mod & MOD_SATURATE)
    {
        const int vecsize = vecsize_from_writemask(arg->writemask);
        clampleft = "clamp(";
        if (vecsize == 1)
            clampright = ", 0.0, 1.0)";
        else
        {
            snprintf(clampbuf, sizeof (clampbuf),
                     ", vec%d(0.0), vec%d(1.0))", vecsize, vecsize);
            clampright = clampbuf;
        } // else
    } // if

    // MSDN says MOD_PP is a hint and many implementations ignore it. So do we.

    // CENTROID only allowed in DCL opcodes, which shouldn't come through here.
    assert((arg->result_mod & MOD_CENTROID) == 0);

    if (ctx->predicated)
    {
        fail(ctx, "predicated destinations unsupported");  // !!! FIXME
        *buf = '\0';
        return buf;
    } // if

    char operation[256];
    va_list ap;
    va_start(ap, fmt);
    const int len = vsnprintf(operation, sizeof (operation), fmt, ap);
    va_end(ap);
    if (len >= sizeof (operation))
    {
        fail(ctx, "operation string too large");  // I'm lazy.  :P
        *buf = '\0';
        return buf;
    } // if

    const char *result_shift_str = "";
    switch (arg->result_shift)
    {
        case 0x1: result_shift_str = " * 2.0"; break;
        case 0x2: result_shift_str = " * 4.0"; break;
        case 0x3: result_shift_str = " * 8.0"; break;
        case 0xD: result_shift_str = " / 8.0"; break;
        case 0xE: result_shift_str = " / 4.0"; break;
        case 0xF: result_shift_str = " / 2.0"; break;
    } // switch
    need_parens |= (result_shift_str[0] != '\0');

    char regnum_str[16];
    const char *regtype_str = get_GLSL_register_string(ctx, arg->regtype,
                                                       arg->regnum, regnum_str,
                                                       sizeof (regnum_str));
    char writemask_str[6];
    size_t i = 0;
    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !writemask_xyzw(arg->writemask))
    {
        writemask_str[i++] = '.';
        if (arg->writemask0) writemask_str[i++] = 'x';
        if (arg->writemask1) writemask_str[i++] = 'y';
        if (arg->writemask2) writemask_str[i++] = 'z';
        if (arg->writemask3) writemask_str[i++] = 'w';
    } // if
    writemask_str[i] = '\0';
    assert(i < sizeof (writemask_str));

    const char *leftparen = (need_parens) ? "(" : "";
    const char *rightparen = (need_parens) ? ")" : "";

    snprintf(buf, buflen, "%s_%s%s%s = %s%s%s%s%s%s;",
             ctx->shader_type_str, regtype_str, regnum_str, writemask_str,
             clampleft, leftparen, operation, rightparen, result_shift_str,
             clampright);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_GLSL_destarg_assign


static char *make_GLSL_swizzle_string(char *swiz_str, const size_t strsize,
                                      const int swizzle, const int writemask)
{
    size_t i = 0;
    if ( (!no_swizzle(swizzle)) || (!writemask_xyzw(writemask)) )
    {
        const int writemask0 = (writemask >> 0) & 0x1;
        const int writemask1 = (writemask >> 1) & 0x1;
        const int writemask2 = (writemask >> 2) & 0x1;
        const int writemask3 = (writemask >> 3) & 0x1;

        const int swizzle_x = (swizzle >> 0) & 0x3;
        const int swizzle_y = (swizzle >> 2) & 0x3;
        const int swizzle_z = (swizzle >> 4) & 0x3;
        const int swizzle_w = (swizzle >> 6) & 0x3;

        swiz_str[i++] = '.';
        if (writemask0) swiz_str[i++] = swizzle_channels[swizzle_x];
        if (writemask1) swiz_str[i++] = swizzle_channels[swizzle_y];
        if (writemask2) swiz_str[i++] = swizzle_channels[swizzle_z];
        if (writemask3) swiz_str[i++] = swizzle_channels[swizzle_w];
    } // if
    assert(i < strsize);
    swiz_str[i] = '\0';
    return swiz_str;
} // make_GLSL_swizzle_string


static const char *make_GLSL_srcarg_string(Context *ctx, const size_t idx,
                                           const int writemask, char *buf,
                                           const size_t buflen)
{
    *buf = '\0';

    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];

    const char *premod_str = "";
    const char *postmod_str = "";
    switch (arg->src_mod)
    {
        case SRCMOD_NEGATE:
            premod_str = "-";
            break;

        case SRCMOD_BIASNEGATE:
            premod_str = "-(";
            postmod_str = " - 0.5)";
            break;

        case SRCMOD_BIAS:
            premod_str = "(";
            postmod_str = " - 0.5)";
            break;

        case SRCMOD_SIGNNEGATE:
            premod_str = "-((";
            postmod_str = " - 0.5) * 2.0)";
            break;

        case SRCMOD_SIGN:
            premod_str = "((";
            postmod_str = " - 0.5) * 2.0)";
            break;

        case SRCMOD_COMPLEMENT:
            premod_str = "(1.0 - ";
            postmod_str = ")";
            break;

        case SRCMOD_X2NEGATE:
            premod_str = "-(";
            postmod_str = " * 2.0)";
            break;

        case SRCMOD_X2:
            premod_str = "(";
            postmod_str = " * 2.0)";
            break;

        case SRCMOD_DZ:
            fail(ctx, "SRCMOD_DZ unsupported"); return buf; // !!! FIXME
            postmod_str = "_dz";
            break;

        case SRCMOD_DW:
            fail(ctx, "SRCMOD_DW unsupported"); return buf; // !!! FIXME
            postmod_str = "_dw";
            break;

        case SRCMOD_ABSNEGATE:
            premod_str = "-abs(";
            postmod_str = ")";
            break;

        case SRCMOD_ABS:
            premod_str = "abs(";
            postmod_str = ")";
            break;

        case SRCMOD_NOT:
            premod_str = "!";
            break;

        case SRCMOD_NONE:
        case SRCMOD_TOTAL:
             break;  // stop compiler whining.
    } // switch

    const char *regtype_str = NULL;

    if (!arg->relative)
    {
        regtype_str = get_GLSL_varname_in_buf(ctx, arg->regtype, arg->regnum,
                                              (char *) alloca(64), 64);
    } // if

    const char *rel_lbracket = "";
    char rel_offset[32] = { '\0' };
    const char *rel_rbracket = "";
    char rel_swizzle[4] = { '\0' };
    const char *rel_regtype_str = "";
    if (arg->relative)
    {
        if (arg->regtype == REG_TYPE_INPUT)
            regtype_str=get_GLSL_input_array_varname(ctx,(char*)alloca(64),64);
        else
        {
            assert(arg->regtype == REG_TYPE_CONST);
            const int arrayidx = arg->relative_array->index;
            const int offset = arg->regnum - arrayidx;
            assert(offset >= 0);
            if (arg->relative_array->constant)
            {
                const int arraysize = arg->relative_array->count;
                regtype_str = get_GLSL_const_array_varname_in_buf(ctx,
                                arrayidx, arraysize, (char *) alloca(64), 64);
                if (offset != 0)
                    snprintf(rel_offset, sizeof (rel_offset), "%d + ", offset);
            } // if
            else
            {
                regtype_str = get_GLSL_uniform_array_varname(ctx, arg->regtype,
                                                      (char *) alloca(64), 64);
                if (offset == 0)
                {
                    snprintf(rel_offset, sizeof (rel_offset),
                             "ARRAYBASE_%d + ", arrayidx);
                } // if
                else
                {
                    snprintf(rel_offset, sizeof (rel_offset),
                             "(ARRAYBASE_%d + %d) + ", arrayidx, offset);
                } // else
            } // else
        } // else

        rel_lbracket = "[";

        rel_regtype_str = get_GLSL_varname_in_buf(ctx, arg->relative_regtype,
                                                  arg->relative_regnum,
                                                  (char *) alloca(64), 64);
        rel_swizzle[0] = '.';
        rel_swizzle[1] = swizzle_channels[arg->relative_component];
        rel_swizzle[2] = '\0';
        rel_rbracket = "]";
    } // if

    char swiz_str[6] = { '\0' };
    if (!isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum))
    {
        make_GLSL_swizzle_string(swiz_str, sizeof (swiz_str),
                                 arg->swizzle, writemask);
    } // if

    if (regtype_str == NULL)
    {
        fail(ctx, "Unknown source register type.");
        return buf;
    } // if

    snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s",
             premod_str, regtype_str, rel_lbracket, rel_offset,
             rel_regtype_str, rel_swizzle, rel_rbracket, swiz_str,
             postmod_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_GLSL_srcarg_string

// generate some convenience functions.
#define MAKE_GLSL_SRCARG_STRING_(mask, bitmask) \
    static inline const char *make_GLSL_srcarg_string_##mask(Context *ctx, \
                                                const size_t idx, char *buf, \
                                                const size_t buflen) { \
        return make_GLSL_srcarg_string(ctx, idx, bitmask, buf, buflen); \
    }
MAKE_GLSL_SRCARG_STRING_(x, (1 << 0))
MAKE_GLSL_SRCARG_STRING_(y, (1 << 1))
MAKE_GLSL_SRCARG_STRING_(z, (1 << 2))
MAKE_GLSL_SRCARG_STRING_(w, (1 << 3))
MAKE_GLSL_SRCARG_STRING_(scalar, (1 << 0))
MAKE_GLSL_SRCARG_STRING_(full, 0xF)
MAKE_GLSL_SRCARG_STRING_(masked, ctx->dest_arg.writemask)
MAKE_GLSL_SRCARG_STRING_(vec3, 0x7)
MAKE_GLSL_SRCARG_STRING_(vec2, 0x3)
#undef MAKE_GLSL_SRCARG_STRING_

// special cases for comparison opcodes...

static const char *get_GLSL_comparison_string_scalar(Context *ctx)
{
    static const char *comps[] = { "", ">", "==", ">=", "<", "!=", "<=" };
    if (ctx->instruction_controls >= STATICARRAYLEN(comps))
    {
        fail(ctx, "unknown comparison control");
        return "";
    } // if

    return comps[ctx->instruction_controls];
} // get_GLSL_comparison_string_scalar

static const char *get_GLSL_comparison_string_vector(Context *ctx)
{
    static const char *comps[] = {
        "", "greaterThan", "equal", "greaterThanEqual", "lessThan",
        "notEqual", "lessThanEqual"
    };

    if (ctx->instruction_controls >= STATICARRAYLEN(comps))
    {
        fail(ctx, "unknown comparison control");
        return "";
    } // if

    return comps[ctx->instruction_controls];
} // get_GLSL_comparison_string_vector


static void emit_GLSL_start(Context *ctx, const char *profilestr)
{
    if (!shader_is_vertex(ctx) && !shader_is_pixel(ctx))
    {
        failf(ctx, "Shader type %u unsupported in this profile.",
              (uint) ctx->shader_type);
        return;
    } // if

    else if (strcmp(profilestr, MOJOSHADER_PROFILE_GLSL) == 0)
    {
        // No gl_FragData[] before GLSL 1.10, so we have to force the version.
        push_output(ctx, &ctx->preflight);
        output_line(ctx, "#version 110");
        pop_output(ctx);
    } // else if

    #if SUPPORT_PROFILE_GLSL120
    else if (strcmp(profilestr, MOJOSHADER_PROFILE_GLSL120) == 0)
    {
        ctx->profile_supports_glsl120 = 1;
        push_output(ctx, &ctx->preflight);
        output_line(ctx, "#version 120");
        pop_output(ctx);
    } // else if
    #endif

    #if SUPPORT_PROFILE_GLSLES
    else if (strcmp(profilestr, MOJOSHADER_PROFILE_GLSLES) == 0)
    {
        ctx->profile_supports_glsles = 1;
        push_output(ctx, &ctx->preflight);
        output_line(ctx, "#version 100");
        if (shader_is_vertex(ctx))
            output_line(ctx, "precision highp float;");
        else
            output_line(ctx, "precision mediump float;");
        output_line(ctx, "precision mediump int;");
        output_line(ctx, "varying vec4 v_FrontColor;");
        output_line(ctx, "varying vec4 v_FrontSecondaryColor;");
        output_line(ctx, "varying vec4 v_TexCoord[10];"); // 10 according to SM3
        pop_output(ctx);
    } // else if
    #endif

    else
    {
        failf(ctx, "Profile '%s' unsupported or unknown.", profilestr);
        return;
    } // else

    push_output(ctx, &ctx->mainline_intro);
    output_line(ctx, "void main()");
    output_line(ctx, "{");
    pop_output(ctx);

    set_output(ctx, &ctx->mainline);
    ctx->indent++;
} // emit_GLSL_start

static void emit_GLSL_RET(Context *ctx);
static void emit_GLSL_end(Context *ctx)
{
    // ps_1_* writes color to r0 instead oC0. We move it to the right place.
    // We don't have to worry about a RET opcode messing this up, since
    //  RET isn't available before ps_2_0.
    if (shader_is_pixel(ctx) && !shader_version_atleast(ctx, 2, 0))
    {
        const char *shstr = ctx->shader_type_str;
        set_used_register(ctx, REG_TYPE_COLOROUT, 0, 1);
        output_line(ctx, "%s_oC0 = %s_r0;", shstr, shstr);
    } // if
    else if (shader_is_vertex(ctx))
    {
#ifdef MOJOSHADER_FLIP_RENDERTARGET
        output_line(ctx, "gl_Position.y = gl_Position.y * vpFlip;");
#endif
#ifdef MOJOSHADER_DEPTH_CLIPPING
        output_line(ctx, "gl_Position.z = gl_Position.z * 2.0 - gl_Position.w;");
#endif
    } // else if

    // force a RET opcode if we're at the end of the stream without one.
    if (ctx->previous_opcode != OPCODE_RET)
        emit_GLSL_RET(ctx);
} // emit_GLSL_end

static void emit_GLSL_phase(Context *ctx)
{
    // no-op in GLSL.
} // emit_GLSL_phase

static void output_GLSL_uniform_array(Context *ctx, const RegisterType regtype,
                                      const int size)
{
    if (size > 0)
    {
        char buf[64];
        get_GLSL_uniform_array_varname(ctx, regtype, buf, sizeof (buf));
        const char *typ;
        switch (regtype)
        {
            case REG_TYPE_CONST: typ = "vec4"; break;
            case REG_TYPE_CONSTINT: typ ="ivec4"; break;
            case REG_TYPE_CONSTBOOL: typ = "bool"; break;
            default:
            {
                fail(ctx, "BUG: used a uniform we don't know how to define.");
                return;
            } // default
        } // switch
        output_line(ctx, "uniform %s %s[%d];", typ, buf, size);
    } // if
} // output_GLSL_uniform_array

static void emit_GLSL_finalize(Context *ctx)
{
    // throw some blank lines around to make source more readable.
    push_output(ctx, &ctx->globals);
    output_blank_line(ctx);
    pop_output(ctx);

    // If we had a relative addressing of REG_TYPE_INPUT, we need to build
    //  an array for it at the start of main(). GLSL doesn't let you specify
    //  arrays of attributes.
    //vec4 blah_array[BIGGEST_ARRAY];
    if (ctx->have_relative_input_registers) // !!! FIXME
        fail(ctx, "Relative addressing of input registers not supported.");

    push_output(ctx, &ctx->preflight);
    output_GLSL_uniform_array(ctx, REG_TYPE_CONST, ctx->uniform_float4_count);
    output_GLSL_uniform_array(ctx, REG_TYPE_CONSTINT, ctx->uniform_int4_count);
    output_GLSL_uniform_array(ctx, REG_TYPE_CONSTBOOL, ctx->uniform_bool_count);
#ifdef MOJOSHADER_FLIP_RENDERTARGET
    if (shader_is_vertex(ctx))
        output_line(ctx, "uniform float vpFlip;");
#endif
    pop_output(ctx);
} // emit_GLSL_finalize

static void emit_GLSL_global(Context *ctx, RegisterType regtype, int regnum)
{
    char varname[64];
    get_GLSL_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    push_output(ctx, &ctx->globals);
    switch (regtype)
    {
        case REG_TYPE_ADDRESS:
            if (shader_is_vertex(ctx))
                output_line(ctx, "ivec4 %s;", varname);
            else if (shader_is_pixel(ctx))  // actually REG_TYPE_TEXTURE.
            {
                // We have to map texture registers to temps for ps_1_1, since
                //  they work like temps, initialize with tex coords, and the
                //  ps_1_1 TEX opcode expects to overwrite it.
                if (!shader_version_atleast(ctx, 1, 4))
                {
                    output_line(ctx, "vec4 %s = gl_TexCoord[%d];",
                                varname, regnum);
                } // if
            } // else if
            break;
        case REG_TYPE_PREDICATE:
            output_line(ctx, "bvec4 %s;", varname);
            break;
        case REG_TYPE_TEMP:
            output_line(ctx, "vec4 %s;", varname);
            break;
        case REG_TYPE_LOOP:
            break; // no-op. We declare these in for loops at the moment.
        case REG_TYPE_LABEL:
            break; // no-op. If we see it here, it means we optimized it out.
        default:
            fail(ctx, "BUG: we used a register we don't know how to define.");
            break;
    } // switch
    pop_output(ctx);
} // emit_GLSL_global

static void emit_GLSL_array(Context *ctx, VariableList *var)
{
    // All uniforms (except constant arrays, which only get pushed once at
    //  compile time) are now packed into a single array, so we can batch
    //  the uniform transfers. So this doesn't actually define an array
    //  here; the one, big array is emitted during finalization instead.
    // However, we need to #define the offset into the one, big array here,
    //  and let dereferences use that #define.
    const int base = var->index;
    const int glslbase = ctx->uniform_float4_count;
    push_output(ctx, &ctx->globals);
    output_line(ctx, "#define ARRAYBASE_%d %d", base, glslbase);
    pop_output(ctx);
    var->emit_position = glslbase;
} // emit_GLSL_array

static void emit_GLSL_const_array(Context *ctx, const ConstantsList *clist,
                                  int base, int size)
{
    char varname[64];
    get_GLSL_const_array_varname_in_buf(ctx,base,size,varname,sizeof(varname));

#if 0
    // !!! FIXME: fails on Nvidia's and Apple's GL, even with #version 120.
    // !!! FIXME:  (the 1.20 spec says it should work, though, I think...)
    if (support_glsl120(ctx))
    {
        // GLSL 1.20 can do constant arrays.
        const char *cstr = NULL;
        push_output(ctx, &ctx->globals);
        output_line(ctx, "const vec4 %s[%d] = vec4[%d](", varname, size, size);
        ctx->indent++;

        int i;
        for (i = 0; i < size; i++)
        {
            while (clist->constant.type != MOJOSHADER_UNIFORM_FLOAT)
                clist = clist->next;
            assert(clist->constant.index == (base + i));

            char val0[32];
            char val1[32];
            char val2[32];
            char val3[32];
            floatstr(ctx, val0, sizeof (val0), clist->constant.value.f[0], 1);
            floatstr(ctx, val1, sizeof (val1), clist->constant.value.f[1], 1);
            floatstr(ctx, val2, sizeof (val2), clist->constant.value.f[2], 1);
            floatstr(ctx, val3, sizeof (val3), clist->constant.value.f[3], 1);

            output_line(ctx, "vec4(%s, %s, %s, %s)%s", val0, val1, val2, val3,
                        (i < (size-1)) ? "," : "");

            clist = clist->next;
        } // for

        ctx->indent--;
        output_line(ctx, ");");
        pop_output(ctx);
    } // if

    else
#endif
    {
        // stock GLSL 1.0 can't do constant arrays, so make a uniform array
        //  and have the OpenGL glue assign it at link time. Lame!
        push_output(ctx, &ctx->globals);
        output_line(ctx, "uniform vec4 %s[%d];", varname, size);
        pop_output(ctx);
    } // else
} // emit_GLSL_const_array

static void emit_GLSL_uniform(Context *ctx, RegisterType regtype, int regnum,
                              const VariableList *var)
{
    // Now that we're pushing all the uniforms as one big array, pack these
    //  down, so if we only use register c439, it'll actually map to
    //  glsl_uniforms_vec4[0]. As we push one big array, this will prevent
    //  uploading unused data.

    char varname[64];
    char name[64];
    int index = 0;

    get_GLSL_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    push_output(ctx, &ctx->globals);

    if (var == NULL)
    {
        get_GLSL_uniform_array_varname(ctx, regtype, name, sizeof (name));

        if (regtype == REG_TYPE_CONST)
            index = ctx->uniform_float4_count;
        else if (regtype == REG_TYPE_CONSTINT)
            index = ctx->uniform_int4_count;
        else if (regtype == REG_TYPE_CONSTBOOL)
            index = ctx->uniform_bool_count;
        else  // get_GLSL_uniform_array_varname() would have called fail().
            assert(isfail(ctx));

        output_line(ctx, "#define %s %s[%d]", varname, name, index);
    } // if

    else
    {
        const int arraybase = var->index;
        if (var->constant)
        {
            get_GLSL_const_array_varname_in_buf(ctx, arraybase, var->count,
                                                name, sizeof (name));
            index = (regnum - arraybase);
        } // if
        else
        {
            assert(var->emit_position != -1);
            get_GLSL_uniform_array_varname(ctx, regtype, name, sizeof (name));
            index = (regnum - arraybase) + var->emit_position;
        } // else

        output_line(ctx, "#define %s %s[%d]", varname, name, index);
    } // else

    pop_output(ctx);
} // emit_GLSL_uniform

static void emit_GLSL_sampler(Context *ctx,int stage,TextureType ttype,int tb)
{
    const char *type = "";
    switch (ttype)
    {
        case TEXTURE_TYPE_2D: type = "sampler2D"; break;
        case TEXTURE_TYPE_CUBE: type = "samplerCube"; break;
        case TEXTURE_TYPE_VOLUME: type = "sampler3D"; break;
        default: fail(ctx, "BUG: used a sampler we don't know how to define.");
    } // switch

    char var[64];
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, stage, var, sizeof (var));

    push_output(ctx, &ctx->globals);
    output_line(ctx, "uniform %s %s;", type, var);
    if (tb)  // This sampler used a ps_1_1 TEXBEM opcode?
    {
        char name[64];
        const int index = ctx->uniform_float4_count;
        ctx->uniform_float4_count += 2;
        get_GLSL_uniform_array_varname(ctx, REG_TYPE_CONST, name, sizeof (name));
        output_line(ctx, "#define %s_texbem %s[%d]", var, name, index);
        output_line(ctx, "#define %s_texbeml %s[%d]", var, name, index+1);
    } // if
    pop_output(ctx);
} // emit_GLSL_sampler

static void emit_GLSL_attribute(Context *ctx, RegisterType regtype, int regnum,
                                MOJOSHADER_usage usage, int index, int wmask,
                                int flags)
{
    // !!! FIXME: this function doesn't deal with write masks at all yet!
    const char *usage_str = NULL;
    const char *arrayleft = "";
    const char *arrayright = "";
    char index_str[16] = { '\0' };
    char var[64];

    get_GLSL_varname_in_buf(ctx, regtype, regnum, var, sizeof (var));

    //assert((flags & MOD_PP) == 0);  // !!! FIXME: is PP allowed?

    if (index != 0)  // !!! FIXME: a lot of these MUST be zero.
        snprintf(index_str, sizeof (index_str), "%u", (uint) index);

    if (shader_is_vertex(ctx))
    {
        // pre-vs3 output registers.
        // these don't ever happen in DCL opcodes, I think. Map to vs_3_*
        //  output registers.
        if (!shader_version_atleast(ctx, 3, 0))
        {
            if (regtype == REG_TYPE_RASTOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                index = regnum;
                switch ((const RastOutType) regnum)
                {
                    case RASTOUT_TYPE_POSITION:
                        usage = MOJOSHADER_USAGE_POSITION;
                        break;
                    case RASTOUT_TYPE_FOG:
                        usage = MOJOSHADER_USAGE_FOG;
                        break;
                    case RASTOUT_TYPE_POINT_SIZE:
                        usage = MOJOSHADER_USAGE_POINTSIZE;
                        break;
                } // switch
            } // if

            else if (regtype == REG_TYPE_ATTROUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_COLOR;
                index = regnum;
            } // else if

            else if (regtype == REG_TYPE_TEXCRDOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_TEXCOORD;
                index = regnum;
            } // else if
        } // if

        // to avoid limitations of various GL entry points for input
        // attributes (glSecondaryColorPointer() can only take 3 component
        // items, glVertexPointer() can't do GL_UNSIGNED_BYTE, many other
        // issues), we set up all inputs as generic vertex attributes, so we
        // can pass data in just about any form, and ignore the built-in GLSL
        // attributes like gl_SecondaryColor. Output needs to use the the
        // built-ins, though, but we don't have to worry about the GL entry
        // point limitations there.

        if (regtype == REG_TYPE_INPUT)
        {
            push_output(ctx, &ctx->globals);
            output_line(ctx, "attribute vec4 %s;", var);
            pop_output(ctx);
        } // if

        else if (regtype == REG_TYPE_OUTPUT)
        {
            switch (usage)
            {
                case MOJOSHADER_USAGE_POSITION:
                    usage_str = "gl_Position";
                    break;
                case MOJOSHADER_USAGE_POINTSIZE:
                    usage_str = "gl_PointSize";
                    break;
                case MOJOSHADER_USAGE_COLOR:
                    index_str[0] = '\0';  // no explicit number.
                    if (index == 0)
                    {
#if SUPPORT_PROFILE_GLSLES
                        if (support_glsles(ctx))
                            usage_str = "v_FrontColor";
                        else
#endif
                        usage_str = "gl_FrontColor";
                    } // if
                    else if (index == 1)
                    {
#if SUPPORT_PROFILE_GLSLES
                        if (support_glsles(ctx))
                            usage_str = "v_FrontSecondaryColor";
                        else
#endif
                        usage_str = "gl_FrontSecondaryColor";
                    } // else if
                    break;
                case MOJOSHADER_USAGE_FOG:
                    usage_str = "gl_FogFragCoord";
                    break;
                case MOJOSHADER_USAGE_TEXCOORD:
                    snprintf(index_str, sizeof (index_str), "%u", (uint) index);
#if SUPPORT_PROFILE_GLSLES
                    if (support_glsles(ctx))
                        usage_str = "v_TexCoord";
                    else
#endif
                    usage_str = "gl_TexCoord";
                    arrayleft = "[";
                    arrayright = "]";
                    break;
                default:
                    // !!! FIXME: we need to deal with some more built-in varyings here.
                    break;
            } // switch

            // !!! FIXME: the #define is a little hacky, but it means we don't
            // !!! FIXME:  have to track these separately if this works.
            push_output(ctx, &ctx->globals);
            // no mapping to built-in var? Just make it a regular global, pray.
            if (usage_str == NULL)
                output_line(ctx, "vec4 %s;", var);
            else
            {
                output_line(ctx, "#define %s %s%s%s%s", var, usage_str,
                            arrayleft, index_str, arrayright);
            } // else
            pop_output(ctx);
        } // else if

        else
        {
            fail(ctx, "unknown vertex shader attribute register");
        } // else
    } // if

    else if (shader_is_pixel(ctx))
    {
        // samplers DCLs get handled in emit_GLSL_sampler().

        if (flags & MOD_CENTROID)  // !!! FIXME
        {
            failf(ctx, "centroid unsupported in %s profile", ctx->profile->name);
            return;
        } // if

        if (regtype == REG_TYPE_COLOROUT)
        {
            if (!ctx->have_multi_color_outputs)
                usage_str = "gl_FragColor";  // maybe faster?
            else
            {
                snprintf(index_str, sizeof (index_str), "%u", (uint) regnum);
                usage_str = "gl_FragData";
                arrayleft = "[";
                arrayright = "]";
            } // else
        } // if

        else if (regtype == REG_TYPE_DEPTHOUT)
            usage_str = "gl_FragDepth";

        // !!! FIXME: can you actualy have a texture register with COLOR usage?
        else if ((regtype == REG_TYPE_TEXTURE) || (regtype == REG_TYPE_INPUT))
        {
            if (usage == MOJOSHADER_USAGE_TEXCOORD)
            {
                // ps_1_1 does a different hack for this attribute.
                //  Refer to emit_GLSL_global()'s REG_TYPE_ADDRESS code.
                if (shader_version_atleast(ctx, 1, 4))
                {
                    snprintf(index_str, sizeof (index_str), "%u", (uint) index);
#if SUPPORT_PROFILE_GLSLES
                    if (support_glsles(ctx))
                        usage_str = "v_TexCoord";
                    else
#endif
                    usage_str = "gl_TexCoord";
                    arrayleft = "[";
                    arrayright = "]";
                } // if
            } // if

            else if (usage == MOJOSHADER_USAGE_COLOR)
            {
                index_str[0] = '\0';  // no explicit number.
                if (index == 0)
                {
#if SUPPORT_PROFILE_GLSLES
                    if (support_glsles(ctx))
                        usage_str = "v_FrontColor";
                    else
#endif
                    usage_str = "gl_Color";
                } // if
                else if (index == 1)
                {
#if SUPPORT_PROFILE_GLSLES
                    if (support_glsles(ctx))
                        usage_str = "v_FrontSecondaryColor";
                    else
#endif
                    usage_str = "gl_SecondaryColor";
                } // else if
                else
                    fail(ctx, "unsupported color index");
            } // else if

            else if (usage == MOJOSHADER_USAGE_DEPTH) // !!! FIXME: Possibly more! -flibit
            {
                push_output(ctx, &ctx->globals);
                output_line(ctx, "attribute vec4 %s;", var);
                pop_output(ctx);
            } // else if
        } // else if

        else if (regtype == REG_TYPE_MISCTYPE)
        {
            const MiscTypeType mt = (MiscTypeType) regnum;
            if (mt == MISCTYPE_TYPE_FACE)
            {
                push_output(ctx, &ctx->globals);
                output_line(ctx, "float %s = gl_FrontFacing ? 1.0 : -1.0;", var);
                pop_output(ctx);
            } // if
            else if (mt == MISCTYPE_TYPE_POSITION)
            {
                index_str[0] = '\0';  // no explicit number.
                usage_str = "gl_FragCoord";  // !!! FIXME: is this the same coord space as D3D?
            } // else if
            else
            {
                fail(ctx, "BUG: unhandled misc register");
            } // else
        } // else if

        else
        {
            fail(ctx, "unknown pixel shader attribute register");
        } // else

        if (usage_str != NULL)
        {
            push_output(ctx, &ctx->globals);
            output_line(ctx, "#define %s %s%s%s%s", var, usage_str,
                        arrayleft, index_str, arrayright);
            pop_output(ctx);
        } // if
    } // else if

    else
    {
        fail(ctx, "Unknown shader type");  // state machine should catch this.
    } // else
} // emit_GLSL_attribute

static void emit_GLSL_NOP(Context *ctx)
{
    // no-op is a no-op.  :)
} // emit_GLSL_NOP

static void emit_GLSL_MOV(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "%s", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_MOV

static void emit_GLSL_ADD(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "%s + %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_ADD

static void emit_GLSL_SUB(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "%s - %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_SUB

static void emit_GLSL_MAD(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_GLSL_srcarg_string_masked(ctx, 2, src2, sizeof (src2));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "(%s * %s) + %s", src0, src1, src2);
    output_line(ctx, "%s", code);
} // emit_GLSL_MAD

static void emit_GLSL_MUL(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "%s * %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_MUL

static void emit_GLSL_RCP(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "1.0 / %s", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_RCP

static void emit_GLSL_RSQ(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "inversesqrt(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_RSQ

static void emit_GLSL_dotprod(Context *ctx, const char *src0, const char *src1,
                              const char *extra)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char castleft[16] = { '\0' };
    const char *castright = "";
    if (vecsize != 1)
    {
        snprintf(castleft, sizeof (castleft), "vec%d(", vecsize);
        castright = ")";
    } // if

    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "%sdot(%s, %s)%s%s",
                             castleft, src0, src1, extra, castright);
    output_line(ctx, "%s", code);
} // emit_GLSL_dotprod

static void emit_GLSL_DP3(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_vec3(ctx, 1, src1, sizeof (src1));
    emit_GLSL_dotprod(ctx, src0, src1, "");
} // emit_GLSL_DP3

static void emit_GLSL_DP4(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_full(ctx, 1, src1, sizeof (src1));
    emit_GLSL_dotprod(ctx, src0, src1, "");
} // emit_GLSL_DP4

static void emit_GLSL_MIN(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "min(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_MIN

static void emit_GLSL_MAX(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "max(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_MAX

static void emit_GLSL_SLT(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // float(bool) or vec(bvec) results in 0.0 or 1.0, like SLT wants.
    if (vecsize == 1)
        make_GLSL_destarg_assign(ctx, code, sizeof (code), "float(%s < %s)", src0, src1);
    else
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "vec%d(lessThan(%s, %s))",
                                 vecsize, src0, src1);
    } // else
    output_line(ctx, "%s", code);
} // emit_GLSL_SLT

static void emit_GLSL_SGE(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // float(bool) or vec(bvec) results in 0.0 or 1.0, like SGE wants.
    if (vecsize == 1)
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "float(%s >= %s)", src0, src1);
    } // if
    else
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "vec%d(greaterThanEqual(%s, %s))",
                                 vecsize, src0, src1);
    } // else
    output_line(ctx, "%s", code);
} // emit_GLSL_SGE

static void emit_GLSL_EXP(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "exp2(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_EXP

static void emit_GLSL_LOG(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "log2(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_LOG

static void emit_GLSL_LIT_helper(Context *ctx)
{
    const char *maxp = "127.9961"; // value from the dx9 reference.

    if (ctx->glsl_generated_lit_helper)
        return;

    ctx->glsl_generated_lit_helper = 1;

    push_output(ctx, &ctx->helpers);
    output_line(ctx, "vec4 LIT(const vec4 src)");
    output_line(ctx, "{"); ctx->indent++;
    output_line(ctx,   "float power = clamp(src.w, -%s, %s);",maxp,maxp);
    output_line(ctx,   "vec4 retval = vec4(1.0, 0.0, 0.0, 1.0);");
    output_line(ctx,   "if (src.x > 0.0) {"); ctx->indent++;
    output_line(ctx,     "retval.y = src.x;");
    output_line(ctx,     "if (src.y > 0.0) {"); ctx->indent++;
    output_line(ctx,       "retval.z = pow(src.y, power);"); ctx->indent--;
    output_line(ctx,     "}"); ctx->indent--;
    output_line(ctx,   "}");
    output_line(ctx,   "return retval;"); ctx->indent--;
    output_line(ctx, "}");
    output_blank_line(ctx);
    pop_output(ctx);
} // emit_GLSL_LIT_helper

static void emit_GLSL_LIT(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char code[128];
    emit_GLSL_LIT_helper(ctx);
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "LIT(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_LIT

static void emit_GLSL_DST(Context *ctx)
{
    // !!! FIXME: needs to take ctx->dst_arg.writemask into account.
    char src0_y[64]; make_GLSL_srcarg_string_y(ctx, 0, src0_y, sizeof (src0_y));
    char src1_y[64]; make_GLSL_srcarg_string_y(ctx, 1, src1_y, sizeof (src1_y));
    char src0_z[64]; make_GLSL_srcarg_string_z(ctx, 0, src0_z, sizeof (src0_z));
    char src1_w[64]; make_GLSL_srcarg_string_w(ctx, 1, src1_w, sizeof (src1_w));

    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                             "vec4(1.0, %s * %s, %s, %s)",
                             src0_y, src1_y, src0_z, src1_w);
    output_line(ctx, "%s", code);
} // emit_GLSL_DST

static void emit_GLSL_LRP(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_GLSL_srcarg_string_masked(ctx, 2, src2, sizeof (src2));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "mix(%s, %s, %s)",
                             src2, src1, src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_LRP

static void emit_GLSL_FRC(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "fract(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_FRC

static void emit_GLSL_M4X4(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_GLSL_srcarg_string_full(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_GLSL_srcarg_string_full(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_GLSL_srcarg_string_full(ctx, 3, row2, sizeof (row2));
    char row3[64]; make_GLSL_srcarg_string_full(ctx, 4, row3, sizeof (row3));
    char code[256];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                    "vec4(dot(%s, %s), dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                    src0, row0, src0, row1, src0, row2, src0, row3);
    output_line(ctx, "%s", code);
} // emit_GLSL_M4X4

static void emit_GLSL_M4X3(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_GLSL_srcarg_string_full(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_GLSL_srcarg_string_full(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_GLSL_srcarg_string_full(ctx, 3, row2, sizeof (row2));
    char code[256];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                "vec3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1, src0, row2);
    output_line(ctx, "%s", code);
} // emit_GLSL_M4X3

static void emit_GLSL_M3X4(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_GLSL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_GLSL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_GLSL_srcarg_string_vec3(ctx, 3, row2, sizeof (row2));
    char row3[64]; make_GLSL_srcarg_string_vec3(ctx, 4, row3, sizeof (row3));

    char code[256];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                "vec4(dot(%s, %s), dot(%s, %s), "
                                     "dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1,
                                src0, row2, src0, row3);
    output_line(ctx, "%s", code);
} // emit_GLSL_M3X4

static void emit_GLSL_M3X3(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_GLSL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_GLSL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_GLSL_srcarg_string_vec3(ctx, 3, row2, sizeof (row2));
    char code[256];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                "vec3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1, src0, row2);
    output_line(ctx, "%s", code);
} // emit_GLSL_M3X3

static void emit_GLSL_M3X2(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_GLSL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_GLSL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));

    char code[256];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                "vec2(dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1);
    output_line(ctx, "%s", code);
} // emit_GLSL_M3X2

static void emit_GLSL_CALL(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    if (ctx->loops > 0)
        output_line(ctx, "%s(aL);", src0);
    else
        output_line(ctx, "%s();", src0);
} // emit_GLSL_CALL

static void emit_GLSL_CALLNZ(Context *ctx)
{
    // !!! FIXME: if src1 is a constbool that's true, we can remove the
    // !!! FIXME:  if. If it's false, we can make this a no-op.
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));

    if (ctx->loops > 0)
        output_line(ctx, "if (%s) { %s(aL); }", src1, src0);
    else
        output_line(ctx, "if (%s) { %s(); }", src1, src0);
} // emit_GLSL_CALLNZ

static void emit_GLSL_LOOP(Context *ctx)
{
    // !!! FIXME: swizzle?
    char var[64]; get_GLSL_srcarg_varname(ctx, 1, var, sizeof (var));
    assert(ctx->source_args[0].regnum == 0);  // in case they add aL1 someday.
    output_line(ctx, "{");
    ctx->indent++;
    output_line(ctx, "const int aLend = %s.x + %s.y;", var, var);
    output_line(ctx, "for (int aL = %s.y; aL < aLend; aL += %s.z) {", var, var);
    ctx->indent++;
} // emit_GLSL_LOOP

static void emit_GLSL_RET(Context *ctx)
{
    // thankfully, the MSDN specs say a RET _has_ to end a function...no
    //  early returns. So if you hit one, you know you can safely close
    //  a high-level function.
    ctx->indent--;
    output_line(ctx, "}");
    output_blank_line(ctx);
    set_output(ctx, &ctx->subroutines);  // !!! FIXME: is this for LABEL? Maybe set it there so we don't allocate unnecessarily.
} // emit_GLSL_RET

static void emit_GLSL_ENDLOOP(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
    ctx->indent--;
    output_line(ctx, "}");
} // emit_GLSL_ENDLOOP

static void emit_GLSL_LABEL(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    const int label = ctx->source_args[0].regnum;
    RegisterList *reg = reglist_find(&ctx->used_registers, REG_TYPE_LABEL, label);
    assert(ctx->output == ctx->subroutines);  // not mainline, etc.
    assert(ctx->indent == 0);  // we shouldn't be in the middle of a function.

    // MSDN specs say CALL* has to come before the LABEL, so we know if we
    //  can ditch the entire function here as unused.
    if (reg == NULL)
        set_output(ctx, &ctx->ignore);  // Func not used. Parse, but don't output.

    // !!! FIXME: it would be nice if we could determine if a function is
    // !!! FIXME:  only called once and, if so, forcibly inline it.

    const char *uses_loopreg = ((reg) && (reg->misc == 1)) ? "int aL" : "";
    output_line(ctx, "void %s(%s)", src0, uses_loopreg);
    output_line(ctx, "{");
    ctx->indent++;
} // emit_GLSL_LABEL

static void emit_GLSL_DCL(Context *ctx)
{
    // no-op. We do this in our emit_attribute() and emit_uniform().
} // emit_GLSL_DCL

static void emit_GLSL_POW(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                             "pow(abs(%s), %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_POW

static void emit_GLSL_CRS(Context *ctx)
{
    // !!! FIXME: needs to take ctx->dst_arg.writemask into account.
    char src0[64]; make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_vec3(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code),
                             "cross(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_GLSL_CRS

static void emit_GLSL_SGN(Context *ctx)
{
    // (we don't need the temporary registers specified for the D3D opcode.)
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "sign(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_SGN

static void emit_GLSL_ABS(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "abs(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_ABS

static void emit_GLSL_NRM(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "normalize(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_NRM

static void emit_GLSL_SINCOS(Context *ctx)
{
    // we don't care about the temp registers that <= sm2 demands; ignore them.
    //  sm2 also talks about what components are left untouched vs. undefined,
    //  but we just leave those all untouched with GLSL write masks (which
    //  would fulfill the "undefined" requirement, too).
    const int mask = ctx->dest_arg.writemask;
    char src0[64]; make_GLSL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char code[128] = { '\0' };

    if (writemask_x(mask))
        make_GLSL_destarg_assign(ctx, code, sizeof (code), "cos(%s)", src0);
    else if (writemask_y(mask))
        make_GLSL_destarg_assign(ctx, code, sizeof (code), "sin(%s)", src0);
    else if (writemask_xy(mask))
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "vec2(cos(%s), sin(%s))", src0, src0);
    } // else if

    output_line(ctx, "%s", code);
} // emit_GLSL_SINCOS

static void emit_GLSL_REP(Context *ctx)
{
    // !!! FIXME:
    // msdn docs say legal loop values are 0 to 255. We can check DEFI values
    //  at parse time, but if they are pulling a value from a uniform, do
    //  we clamp here?
    // !!! FIXME: swizzle is legal here, right?
    char src0[64]; make_GLSL_srcarg_string_x(ctx, 0, src0, sizeof (src0));
    const uint rep = (uint) ctx->reps;
    output_line(ctx, "for (int rep%u = 0; rep%u < %s; rep%u++) {",
                rep, rep, src0, rep);
    ctx->indent++;
} // emit_GLSL_REP

static void emit_GLSL_ENDREP(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
} // emit_GLSL_ENDREP

static void emit_GLSL_IF(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    output_line(ctx, "if (%s) {", src0);
    ctx->indent++;
} // emit_GLSL_IF

static void emit_GLSL_IFC(Context *ctx)
{
    const char *comp = get_GLSL_comparison_string_scalar(ctx);
    char src0[64]; make_GLSL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_scalar(ctx, 1, src1, sizeof (src1));
    output_line(ctx, "if (%s %s %s) {", src0, comp, src1);
    ctx->indent++;
} // emit_GLSL_IFC

static void emit_GLSL_ELSE(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "} else {");
    ctx->indent++;
} // emit_GLSL_ELSE

static void emit_GLSL_ENDIF(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
} // emit_GLSL_ENDIF

static void emit_GLSL_BREAK(Context *ctx)
{
    output_line(ctx, "break;");
} // emit_GLSL_BREAK

static void emit_GLSL_BREAKC(Context *ctx)
{
    const char *comp = get_GLSL_comparison_string_scalar(ctx);
    char src0[64]; make_GLSL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_scalar(ctx, 1, src1, sizeof (src1));
    output_line(ctx, "if (%s %s %s) { break; }", src0, comp, src1);
} // emit_GLSL_BREAKC

static void emit_GLSL_MOVA(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];

    if (vecsize == 1)
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "int(floor(abs(%s) + 0.5) * sign(%s))",
                                 src0, src0);
    } // if

    else
    {
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                            "ivec%d(floor(abs(%s) + vec%d(0.5)) * sign(%s))",
                            vecsize, src0, vecsize, src0);
    } // else

    output_line(ctx, "%s", code);
} // emit_GLSL_MOVA

static void emit_GLSL_DEFB(Context *ctx)
{
    char varname[64]; get_GLSL_destarg_varname(ctx, varname, sizeof (varname));
    push_output(ctx, &ctx->globals);
    output_line(ctx, "const bool %s = %s;",
                varname, ctx->dwords[0] ? "true" : "false");
    pop_output(ctx);
} // emit_GLSL_DEFB

static void emit_GLSL_DEFI(Context *ctx)
{
    char varname[64]; get_GLSL_destarg_varname(ctx, varname, sizeof (varname));
    const int32 *x = (const int32 *) ctx->dwords;
    push_output(ctx, &ctx->globals);
    output_line(ctx, "const ivec4 %s = ivec4(%d, %d, %d, %d);",
                varname, (int) x[0], (int) x[1], (int) x[2], (int) x[3]);
    pop_output(ctx);
} // emit_GLSL_DEFI

EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXCRD)

static void emit_GLSL_TEXKILL(Context *ctx)
{
    char dst[64]; get_GLSL_destarg_varname(ctx, dst, sizeof (dst));
    output_line(ctx, "if (any(lessThan(%s.xyz, vec3(0.0)))) discard;", dst);
} // emit_GLSL_TEXKILL

static void glsl_texld(Context *ctx, const int texldd)
{
    if (!shader_version_atleast(ctx, 1, 4))
    {
        DestArgInfo *info = &ctx->dest_arg;
        char dst[64];
        char sampler[64];
        char code[128] = {0};

        assert(!texldd);

        RegisterList *sreg;
        sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, info->regnum);
        const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);

        // !!! FIXME: this code counts on the register not having swizzles, etc.
        get_GLSL_destarg_varname(ctx, dst, sizeof (dst));
        get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                                sampler, sizeof (sampler));

        if (ttype == TEXTURE_TYPE_2D)
        {
            make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                     "texture2D(%s, %s.xy)",
                                     sampler, dst);
        }
        else if (ttype == TEXTURE_TYPE_CUBE)
        {
            make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                     "textureCube(%s, %s.xyz)",
                                     sampler, dst);
        }
        else if (ttype == TEXTURE_TYPE_VOLUME)
        {
            make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                     "texture3D(%s, %s.xyz)",
                                     sampler, dst);
        }
        else
        {
            fail(ctx, "unexpected texture type");
        } // else
        output_line(ctx, "%s", code);
    } // if

    else if (!shader_version_atleast(ctx, 2, 0))
    {
        // ps_1_4 is different, too!
        fail(ctx, "TEXLD == Shader Model 1.4 unimplemented.");  // !!! FIXME
        return;
    } // else if

    else
    {
        const SourceArgInfo *samp_arg = &ctx->source_args[1];
        RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                          samp_arg->regnum);
        const char *funcname = NULL;
        char src0[64] = { '\0' };
        char src1[64]; get_GLSL_srcarg_varname(ctx, 1, src1, sizeof (src1)); // !!! FIXME: SRC_MOD?
        char src2[64] = { '\0' };
        char src3[64] = { '\0' };

        if (sreg == NULL)
        {
            fail(ctx, "TEXLD using undeclared sampler");
            return;
        } // if

        if (texldd)
        {
            if (sreg->index == TEXTURE_TYPE_2D)
            {
                make_GLSL_srcarg_string_vec2(ctx, 2, src2, sizeof (src2));
                make_GLSL_srcarg_string_vec2(ctx, 3, src3, sizeof (src3));
            } // if
            else
            {
                assert((sreg->index == TEXTURE_TYPE_CUBE) || (sreg->index == TEXTURE_TYPE_VOLUME));
                make_GLSL_srcarg_string_vec3(ctx, 2, src2, sizeof (src2));
                make_GLSL_srcarg_string_vec3(ctx, 3, src3, sizeof (src3));
            } // else
        } // if

        // !!! FIXME: can TEXLDD set instruction_controls?
        // !!! FIXME: does the d3d bias value map directly to GLSL?
        const char *biassep = "";
        char bias[64] = { '\0' };
        if (ctx->instruction_controls == CONTROL_TEXLDB)
        {
            biassep = ", ";
            make_GLSL_srcarg_string_w(ctx, 0, bias, sizeof (bias));
        } // if

        switch ((const TextureType) sreg->index)
        {
            case TEXTURE_TYPE_2D:
                if (ctx->instruction_controls == CONTROL_TEXLDP)
                {
                    funcname = "texture2DProj";
                    make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
                } // if
                else  // texld/texldb
                {
                    funcname = "texture2D";
                    make_GLSL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
                } // else
                break;
            case TEXTURE_TYPE_CUBE:
                if (ctx->instruction_controls == CONTROL_TEXLDP)
                    fail(ctx, "TEXLDP on a cubemap");  // !!! FIXME: is this legal?
                funcname = "textureCube";
                make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
                break;
            case TEXTURE_TYPE_VOLUME:
                if (ctx->instruction_controls == CONTROL_TEXLDP)
                {
                    funcname = "texture3DProj";
                    make_GLSL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
                } // if
                else  // texld/texldb
                {
                    funcname = "texture3D";
                    make_GLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
                } // else
                break;
            default:
                fail(ctx, "unknown texture type");
                return;
        } // switch

        assert(!isscalar(ctx, ctx->shader_type, samp_arg->regtype, samp_arg->regnum));
        char swiz_str[6] = { '\0' };
        make_GLSL_swizzle_string(swiz_str, sizeof (swiz_str),
                                 samp_arg->swizzle, ctx->dest_arg.writemask);

        char code[128];
        if (texldd)
        {
            make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                     "%sGrad(%s, %s, %s, %s)%s", funcname,
                                     src1, src0, src2, src3, swiz_str);
        } // if
        else
        {
            make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                     "%s(%s, %s%s%s)%s", funcname,
                                     src1, src0, biassep, bias, swiz_str);
        } // else

        output_line(ctx, "%s", code);
    } // else
} // glsl_texld

static void emit_GLSL_TEXLD(Context *ctx)
{
    glsl_texld(ctx, 0);
} // emit_GLSL_TEXLD
    

static void emit_GLSL_TEXBEM(Context *ctx)
{
    DestArgInfo *info = &ctx->dest_arg;
    char dst[64]; get_GLSL_destarg_varname(ctx, dst, sizeof (dst));
    char src[64]; get_GLSL_srcarg_varname(ctx, 0, src, sizeof (src));
    char sampler[64];
    char code[512];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "texture2D(%s, vec2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))",
        sampler,
        dst, sampler, src, sampler, src,
        dst, sampler, src, sampler, src);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXBEM


static void emit_GLSL_TEXBEML(Context *ctx)
{
    // !!! FIXME: this code counts on the register not having swizzles, etc.
    DestArgInfo *info = &ctx->dest_arg;
    char dst[64]; get_GLSL_destarg_varname(ctx, dst, sizeof (dst));
    char src[64]; get_GLSL_srcarg_varname(ctx, 0, src, sizeof (src));
    char sampler[64];
    char code[512];

    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "(texture2D(%s, vec2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))) *"
        " ((%s.z * %s_texbeml.x) + %s_texbem.y)",
        sampler,
        dst, sampler, src, sampler, src,
        dst, sampler, src, sampler, src,
        src, sampler, sampler);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXBEML

EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2AR) // !!! FIXME
EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2GB) // !!! FIXME


static void emit_GLSL_TEXM3X2PAD(Context *ctx)
{
    // no-op ... work happens in emit_GLSL_TEXM3X2TEX().
} // emit_GLSL_TEXM3X2PAD

static void emit_GLSL_TEXM3X2TEX(Context *ctx)
{
    if (ctx->texm3x2pad_src0 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char sampler[64];
    char code[512];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_src0,
                            src0, sizeof (src0));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_dst0,
                            src1, sizeof (src1));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src2, sizeof (src2));
    get_GLSL_destarg_varname(ctx, dst, sizeof (dst));

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "texture2D(%s, vec2(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz)))",
        sampler, src0, src1, src2, dst);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXM3X2TEX

static void emit_GLSL_TEXM3X3PAD(Context *ctx)
{
    // no-op ... work happens in emit_GLSL_TEXM3X3*().
} // emit_GLSL_TEXM3X3PAD

static void emit_GLSL_TEXM3X3TEX(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char sampler[64];
    char code[512];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_GLSL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s,"
            " vec3(dot(%s.xyz, %s.xyz),"
            " dot(%s.xyz, %s.xyz),"
            " dot(%s.xyz, %s.xyz)))",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXM3X3TEX

static void emit_GLSL_TEXM3X3SPEC_helper(Context *ctx)
{
    if (ctx->glsl_generated_texm3x3spec_helper)
        return;

    ctx->glsl_generated_texm3x3spec_helper = 1;

    push_output(ctx, &ctx->helpers);
    output_line(ctx, "vec3 TEXM3X3SPEC_reflection(const vec3 normal, const vec3 eyeray)");
    output_line(ctx, "{"); ctx->indent++;
    output_line(ctx,   "return (2.0 * ((normal * eyeray) / (normal * normal)) * normal) - eyeray;"); ctx->indent--;
    output_line(ctx, "}");
    output_blank_line(ctx);
    pop_output(ctx);
} // emit_GLSL_TEXM3X3SPEC_helper

static void emit_GLSL_TEXM3X3SPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char src5[64];
    char sampler[64];
    char code[512];

    emit_GLSL_TEXM3X3SPEC_helper(ctx);

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[1].regnum,
                            src5, sizeof (src5));
    get_GLSL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s, "
            "TEXM3X3SPEC_reflection("
                "vec3("
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz)"
                "),"
                "%s.xyz,"
            ")"
        ")",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4, src5);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXM3X3SPEC

static void emit_GLSL_TEXM3X3VSPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char sampler[64];
    char code[512];

    emit_GLSL_TEXM3X3SPEC_helper(ctx);

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_GLSL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s, "
            "TEXM3X3SPEC_reflection("
                "vec3("
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz)"
                "), "
                "vec3(%s.w, %s.w, %s.w)"
            ")"
        ")",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4, src0, src2, dst);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXM3X3VSPEC

static void emit_GLSL_EXPP(Context *ctx)
{
    // !!! FIXME: msdn's asm docs don't list this opcode, I'll have to check the driver documentation.
    emit_GLSL_EXP(ctx);  // I guess this is just partial precision EXP?
} // emit_GLSL_EXPP

static void emit_GLSL_LOGP(Context *ctx)
{
    // LOGP is just low-precision LOG, but we'll take the higher precision.
    emit_GLSL_LOG(ctx);
} // emit_GLSL_LOGP

// common code between CMP and CND.
static void emit_GLSL_comparison_operations(Context *ctx, const char *cmp)
{
    int i, j;
    DestArgInfo *dst = &ctx->dest_arg;
    const SourceArgInfo *srcarg0 = &ctx->source_args[0];
    const int origmask = dst->writemask;
    int used_swiz[4] = { 0, 0, 0, 0 };
    const int writemask[4] = { dst->writemask0, dst->writemask1,
                               dst->writemask2, dst->writemask3 };
    const int src0swiz[4] = { srcarg0->swizzle_x, srcarg0->swizzle_y,
                              srcarg0->swizzle_z, srcarg0->swizzle_w };

    for (i = 0; i < 4; i++)
    {
        int mask = (1 << i);

        if (!writemask[i]) continue;
        if (used_swiz[i]) continue;

        // This is a swizzle we haven't checked yet.
        used_swiz[i] = 1;

        // see if there are any other elements swizzled to match (.yyyy)
        for (j = i + 1; j < 4; j++)
        {
            if (!writemask[j]) continue;
            if (src0swiz[i] != src0swiz[j]) continue;
            mask |= (1 << j);
            used_swiz[j] = 1;
        } // for

        // okay, (mask) should be the writemask of swizzles we like.

        //return make_GLSL_srcarg_string(ctx, idx, (1 << 0));

        char src0[64];
        char src1[64];
        char src2[64];
        make_GLSL_srcarg_string(ctx, 0, (1 << i), src0, sizeof (src0));
        make_GLSL_srcarg_string(ctx, 1, mask, src1, sizeof (src1));
        make_GLSL_srcarg_string(ctx, 2, mask, src2, sizeof (src2));

        set_dstarg_writemask(dst, mask);

        char code[128];
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "((%s %s) ? %s : %s)",
                                 src0, cmp, src1, src2);
        output_line(ctx, "%s", code);
    } // for

    set_dstarg_writemask(dst, origmask);
} // emit_GLSL_comparison_operations

static void emit_GLSL_CND(Context *ctx)
{
    emit_GLSL_comparison_operations(ctx, "> 0.5");
} // emit_GLSL_CND

static void emit_GLSL_DEF(Context *ctx)
{
    const float *val = (const float *) ctx->dwords; // !!! FIXME: could be int?
    char varname[64]; get_GLSL_destarg_varname(ctx, varname, sizeof (varname));
    char val0[32]; floatstr(ctx, val0, sizeof (val0), val[0], 1);
    char val1[32]; floatstr(ctx, val1, sizeof (val1), val[1], 1);
    char val2[32]; floatstr(ctx, val2, sizeof (val2), val[2], 1);
    char val3[32]; floatstr(ctx, val3, sizeof (val3), val[3], 1);

    push_output(ctx, &ctx->globals);
    output_line(ctx, "const vec4 %s = vec4(%s, %s, %s, %s);",
                varname, val0, val1, val2, val3);
    pop_output(ctx);
} // emit_GLSL_DEF

EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2RGB) // !!! FIXME
EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3TEX) // !!! FIXME
EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X2DEPTH) // !!! FIXME
EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3) // !!! FIXME

static void emit_GLSL_TEXM3X3(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char code[512];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_GLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_GLSL_destarg_varname(ctx, dst, sizeof (dst));

    make_GLSL_destarg_assign(ctx, code, sizeof (code),
        "vec4(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), 1.0)",
        src0, src1, src2, src3, dst, src4);

    output_line(ctx, "%s", code);
} // emit_GLSL_TEXM3X3

EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDEPTH) // !!! FIXME

static void emit_GLSL_CMP(Context *ctx)
{
    emit_GLSL_comparison_operations(ctx, ">= 0.0");
} // emit_GLSL_CMP

EMIT_GLSL_OPCODE_UNIMPLEMENTED_FUNC(BEM) // !!! FIXME

static void emit_GLSL_DP2ADD(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_vec2(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_GLSL_srcarg_string_scalar(ctx, 2, src2, sizeof (src2));
    char extra[64]; snprintf(extra, sizeof (extra), " + %s", src2);
    emit_GLSL_dotprod(ctx, src0, src1, extra);
} // emit_GLSL_DP2ADD

static void emit_GLSL_DSX(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "dFdx(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_DSX

static void emit_GLSL_DSY(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_GLSL_destarg_assign(ctx, code, sizeof (code), "dFdy(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_GLSL_DSY

static void emit_GLSL_TEXLDD(Context *ctx)
{
    // !!! FIXME:
    // GLSL 1.30 introduced textureGrad() for this, but it looks like the
    //  functions are overloaded instead of texture2DGrad() (etc).

    // GL_shader_texture_lod and GL_EXT_gpu_shader4 added texture2DGrad*(),
    //  so we'll use them if available. Failing that, we'll just fallback
    //  to a regular texture2D call and hope the mipmap it chooses is close
    //  enough.
    if (!ctx->glsl_generated_texldd_setup)
    {
        ctx->glsl_generated_texldd_setup = 1;
        push_output(ctx, &ctx->preflight);
        output_line(ctx, "#if GL_ARB_shader_texture_lod");
        output_line(ctx, "#extension GL_ARB_shader_texture_lod : enable");
        output_line(ctx, "#define texture2DGrad texture2DGradARB");
        output_line(ctx, "#define texture2DProjGrad texture2DProjARB");
        output_line(ctx, "#elif GL_EXT_gpu_shader4");
        output_line(ctx, "#extension GL_EXT_gpu_shader4 : enable");
        output_line(ctx, "#else");
        output_line(ctx, "#define texture2DGrad(a,b,c,d) texture2D(a,b)");
        output_line(ctx, "#define texture2DProjGrad(a,b,c,d) texture2DProj(a,b)");
        output_line(ctx, "#endif");
        output_blank_line(ctx);
        pop_output(ctx);
    } // if

    glsl_texld(ctx, 1);
} // emit_GLSL_TEXLDD

static void emit_GLSL_SETP(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_GLSL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // destination is always predicate register (which is type bvec4).
    if (vecsize == 1)
    {
        const char *comp = get_GLSL_comparison_string_scalar(ctx);
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "(%s %s %s)", src0, comp, src1);
    } // if
    else
    {
        const char *comp = get_GLSL_comparison_string_vector(ctx);
        make_GLSL_destarg_assign(ctx, code, sizeof (code),
                                 "%s(%s, %s)", comp, src0, src1);
    } // else

    output_line(ctx, "%s", code);
} // emit_GLSL_SETP

static void emit_GLSL_TEXLDL(Context *ctx)
{
    // !!! FIXME: The spec says we can't use GLSL's texture*Lod() built-ins
    // !!! FIXME:  from fragment shaders for some inexplicable reason.
    // !!! FIXME:  For now, you'll just have to suffer with the potentially
    // !!! FIXME:  wrong mipmap until I can figure something out.
    emit_GLSL_TEXLD(ctx);
} // emit_GLSL_TEXLDL

static void emit_GLSL_BREAKP(Context *ctx)
{
    char src0[64]; make_GLSL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    output_line(ctx, "if (%s) { break; }", src0);
} // emit_GLSL_BREAKP

static void emit_GLSL_RESERVED(Context *ctx)
{
    // do nothing; fails in the state machine.
} // emit_GLSL_RESERVED

#endif  // SUPPORT_PROFILE_GLSL


// !!! FIXME: A lot of this is cut-and-paste from the GLSL version.
#if !SUPPORT_PROFILE_METAL
#define PROFILE_EMITTER_METAL(op)
#else
#undef AT_LEAST_ONE_PROFILE
#define AT_LEAST_ONE_PROFILE 1
#define PROFILE_EMITTER_METAL(op) emit_METAL_##op,

#define EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(op) \
    static void emit_METAL_##op(Context *ctx) { \
        fail(ctx, #op " unimplemented in Metal profile"); \
    }

static inline const char *get_METAL_register_string(Context *ctx,
                        const RegisterType regtype, const int regnum,
                        char *regnum_str, const size_t regnum_size)
{
    // turns out these are identical at the moment.
    return get_D3D_register_string(ctx,regtype,regnum,regnum_str,regnum_size);
} // get_METAL_register_string

static const char *get_METAL_uniform_type(Context *ctx, const RegisterType rtype)
{
    switch (rtype)
    {
        case REG_TYPE_CONST: return "float4";
        case REG_TYPE_CONSTINT: return "int4";
        case REG_TYPE_CONSTBOOL: return "bool";
        default: fail(ctx, "BUG: used a uniform we don't know how to define.");
    } // switch

    return NULL;
} // get_METAL_uniform_type

static const char *get_METAL_varname_in_buf(Context *ctx, RegisterType rt,
                                           int regnum, char *buf,
                                           const size_t len)
{
    char regnum_str[16];
    const char *regtype_str = get_METAL_register_string(ctx, rt, regnum,
                                              regnum_str, sizeof (regnum_str));

    // We don't separate vars with vs_ or ps_ here, because, for the most part,
    //  there are only local vars in Metal shaders.
    snprintf(buf,len,"%s%s", regtype_str, regnum_str);
    return buf;
} // get_METAL_varname_in_buf


static const char *get_METAL_varname(Context *ctx, RegisterType rt, int regnum)
{
    char buf[64];
    get_METAL_varname_in_buf(ctx, rt, regnum, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_METAL_varname


static inline const char *get_METAL_const_array_varname_in_buf(Context *ctx,
                                                const int base, const int size,
                                                char *buf, const size_t buflen)
{
    snprintf(buf, buflen, "%s_const_array_%d_%d", ctx->mainfn, base, size);
    return buf;
} // get_METAL_const_array_varname_in_buf

static const char *get_METAL_const_array_varname(Context *ctx, int base, int size)
{
    char buf[64];
    get_METAL_const_array_varname_in_buf(ctx, base, size, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_METAL_const_array_varname


static inline const char *get_METAL_input_array_varname(Context *ctx,
                                                char *buf, const size_t buflen)
{
    snprintf(buf, buflen, "%s", "vertex_input_array");
    return buf;
} // get_METAL_input_array_varname


static const char *get_METAL_uniform_array_varname(Context *ctx,
                                                  const RegisterType regtype,
                                                  char *buf, const size_t len)
{
    const char *shadertype = ctx->shader_type_str;
    const char *type = get_METAL_uniform_type(ctx, regtype);
    snprintf(buf, len, "uniforms.uniforms_%s", type);
    return buf;
} // get_METAL_uniform_array_varname

static const char *get_METAL_destarg_varname(Context *ctx, char *buf, size_t len)
{
    const DestArgInfo *arg = &ctx->dest_arg;
    return get_METAL_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, len);
} // get_METAL_destarg_varname

static const char *get_METAL_srcarg_varname(Context *ctx, const size_t idx,
                                           char *buf, size_t len)
{
    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        *buf = '\0';
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];
    return get_METAL_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, len);
} // get_METAL_srcarg_varname


static const char *make_METAL_destarg_assign(Context *, char *, const size_t,
                                            const char *, ...) ISPRINTF(4,5);

static const char *make_METAL_destarg_assign(Context *ctx, char *buf,
                                            const size_t buflen,
                                            const char *fmt, ...)
{
    int need_parens = 0;
    const DestArgInfo *arg = &ctx->dest_arg;

    if (arg->writemask == 0)
    {
        *buf = '\0';
        return buf;  // no writemask? It's a no-op.
    } // if

    char clampbuf[32] = { '\0' };
    const char *clampleft = "";
    const char *clampright = "";
    if (arg->result_mod & MOD_SATURATE)
    {
        ctx->metal_need_header_common = 1;
        const int vecsize = vecsize_from_writemask(arg->writemask);
        clampleft = "clamp(";
        if (vecsize == 1)
            clampright = ", 0.0, 1.0)";
        else
        {
            snprintf(clampbuf, sizeof (clampbuf),
                     ", float%d(0.0), float%d(1.0))", vecsize, vecsize);
            clampright = clampbuf;
        } // else
    } // if

    // MSDN says MOD_PP is a hint and many implementations ignore it. So do we.

    // CENTROID only allowed in DCL opcodes, which shouldn't come through here.
    assert((arg->result_mod & MOD_CENTROID) == 0);

    if (ctx->predicated)
    {
        fail(ctx, "predicated destinations unsupported");  // !!! FIXME
        *buf = '\0';
        return buf;
    } // if

    char operation[256];
    va_list ap;
    va_start(ap, fmt);
    const int len = vsnprintf(operation, sizeof (operation), fmt, ap);
    va_end(ap);
    if (len >= sizeof (operation))
    {
        fail(ctx, "operation string too large");  // I'm lazy.  :P
        *buf = '\0';
        return buf;
    } // if

    const char *result_shift_str = "";
    switch (arg->result_shift)
    {
        case 0x1: result_shift_str = " * 2.0"; break;
        case 0x2: result_shift_str = " * 4.0"; break;
        case 0x3: result_shift_str = " * 8.0"; break;
        case 0xD: result_shift_str = " / 8.0"; break;
        case 0xE: result_shift_str = " / 4.0"; break;
        case 0xF: result_shift_str = " / 2.0"; break;
    } // switch
    need_parens |= (result_shift_str[0] != '\0');

    char regnum_str[16];
    const char *regtype_str = get_METAL_register_string(ctx, arg->regtype,
                                                       arg->regnum, regnum_str,
                                                       sizeof (regnum_str));
    char writemask_str[6];
    size_t i = 0;
    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !writemask_xyzw(arg->writemask))
    {
        writemask_str[i++] = '.';
        if (arg->writemask0) writemask_str[i++] = 'x';
        if (arg->writemask1) writemask_str[i++] = 'y';
        if (arg->writemask2) writemask_str[i++] = 'z';
        if (arg->writemask3) writemask_str[i++] = 'w';
    } // if
    writemask_str[i] = '\0';
    assert(i < sizeof (writemask_str));

    const char *leftparen = (need_parens) ? "(" : "";
    const char *rightparen = (need_parens) ? ")" : "";

    snprintf(buf, buflen, "%s%s%s = %s%s%s%s%s%s;",
             regtype_str, regnum_str, writemask_str,
             clampleft, leftparen, operation, rightparen, result_shift_str,
             clampright);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_METAL_destarg_assign


static char *make_METAL_swizzle_string(char *swiz_str, const size_t strsize,
                                      const int swizzle, const int writemask)
{
    size_t i = 0;
    if ( (!no_swizzle(swizzle)) || (!writemask_xyzw(writemask)) )
    {
        const int writemask0 = (writemask >> 0) & 0x1;
        const int writemask1 = (writemask >> 1) & 0x1;
        const int writemask2 = (writemask >> 2) & 0x1;
        const int writemask3 = (writemask >> 3) & 0x1;

        const int swizzle_x = (swizzle >> 0) & 0x3;
        const int swizzle_y = (swizzle >> 2) & 0x3;
        const int swizzle_z = (swizzle >> 4) & 0x3;
        const int swizzle_w = (swizzle >> 6) & 0x3;

        swiz_str[i++] = '.';
        if (writemask0) swiz_str[i++] = swizzle_channels[swizzle_x];
        if (writemask1) swiz_str[i++] = swizzle_channels[swizzle_y];
        if (writemask2) swiz_str[i++] = swizzle_channels[swizzle_z];
        if (writemask3) swiz_str[i++] = swizzle_channels[swizzle_w];
    } // if
    assert(i < strsize);
    swiz_str[i] = '\0';
    return swiz_str;
} // make_METAL_swizzle_string


static const char *make_METAL_srcarg_string(Context *ctx, const size_t idx,
                                           const int writemask, char *buf,
                                           const size_t buflen)
{
    *buf = '\0';

    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];

    const char *premod_str = "";
    const char *postmod_str = "";
    switch (arg->src_mod)
    {
        case SRCMOD_NEGATE:
            premod_str = "-";
            break;

        case SRCMOD_BIASNEGATE:
            premod_str = "-(";
            postmod_str = " - 0.5)";
            break;

        case SRCMOD_BIAS:
            premod_str = "(";
            postmod_str = " - 0.5)";
            break;

        case SRCMOD_SIGNNEGATE:
            premod_str = "-((";
            postmod_str = " - 0.5) * 2.0)";
            break;

        case SRCMOD_SIGN:
            premod_str = "((";
            postmod_str = " - 0.5) * 2.0)";
            break;

        case SRCMOD_COMPLEMENT:
            premod_str = "(1.0 - ";
            postmod_str = ")";
            break;

        case SRCMOD_X2NEGATE:
            premod_str = "-(";
            postmod_str = " * 2.0)";
            break;

        case SRCMOD_X2:
            premod_str = "(";
            postmod_str = " * 2.0)";
            break;

        case SRCMOD_DZ:
            fail(ctx, "SRCMOD_DZ unsupported"); return buf; // !!! FIXME
            postmod_str = "_dz";
            break;

        case SRCMOD_DW:
            fail(ctx, "SRCMOD_DW unsupported"); return buf; // !!! FIXME
            postmod_str = "_dw";
            break;

        case SRCMOD_ABSNEGATE:
            ctx->metal_need_header_math = 1;
            premod_str = "-abs(";
            postmod_str = ")";
            break;

        case SRCMOD_ABS:
            ctx->metal_need_header_math = 1;
            premod_str = "abs(";
            postmod_str = ")";
            break;

        case SRCMOD_NOT:
            premod_str = "!";
            break;

        case SRCMOD_NONE:
        case SRCMOD_TOTAL:
             break;  // stop compiler whining.
    } // switch

    const char *regtype_str = NULL;

    if (!arg->relative)
    {
        regtype_str = get_METAL_varname_in_buf(ctx, arg->regtype, arg->regnum,
                                              (char *) alloca(64), 64);
    } // if

    const char *rel_lbracket = "";
    char rel_offset[32] = { '\0' };
    const char *rel_rbracket = "";
    char rel_swizzle[4] = { '\0' };
    const char *rel_regtype_str = "";
    if (arg->relative)
    {
        if (arg->regtype == REG_TYPE_INPUT)
            regtype_str=get_METAL_input_array_varname(ctx,(char*)alloca(64),64);
        else
        {
            assert(arg->regtype == REG_TYPE_CONST);
            const int arrayidx = arg->relative_array->index;
            const int offset = arg->regnum - arrayidx;
            assert(offset >= 0);
            if (arg->relative_array->constant)
            {
                const int arraysize = arg->relative_array->count;
                regtype_str = get_METAL_const_array_varname_in_buf(ctx,
                                arrayidx, arraysize, (char *) alloca(64), 64);
                if (offset != 0)
                    snprintf(rel_offset, sizeof (rel_offset), "%d + ", offset);
            } // if
            else
            {
                regtype_str = get_METAL_uniform_array_varname(ctx, arg->regtype,
                                                      (char *) alloca(64), 64);
                if (offset == 0)
                {
                    snprintf(rel_offset, sizeof (rel_offset),
                             "ARRAYBASE_%d + ", arrayidx);
                } // if
                else
                {
                    snprintf(rel_offset, sizeof (rel_offset),
                             "(ARRAYBASE_%d + %d) + ", arrayidx, offset);
                } // else
            } // else
        } // else

        rel_lbracket = "[";

        rel_regtype_str = get_METAL_varname_in_buf(ctx, arg->relative_regtype,
                                                  arg->relative_regnum,
                                                  (char *) alloca(64), 64);
        rel_swizzle[0] = '.';
        rel_swizzle[1] = swizzle_channels[arg->relative_component];
        rel_swizzle[2] = '\0';
        rel_rbracket = "]";
    } // if

    char swiz_str[6] = { '\0' };
    if (!isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum))
    {
        make_METAL_swizzle_string(swiz_str, sizeof (swiz_str),
                                 arg->swizzle, writemask);
    } // if

    if (regtype_str == NULL)
    {
        fail(ctx, "Unknown source register type.");
        return buf;
    } // if

    snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s",
             premod_str, regtype_str, rel_lbracket, rel_offset,
             rel_regtype_str, rel_swizzle, rel_rbracket, swiz_str,
             postmod_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_METAL_srcarg_string

// generate some convenience functions.
#define MAKE_METAL_SRCARG_STRING_(mask, bitmask) \
    static inline const char *make_METAL_srcarg_string_##mask(Context *ctx, \
                                                const size_t idx, char *buf, \
                                                const size_t buflen) { \
        return make_METAL_srcarg_string(ctx, idx, bitmask, buf, buflen); \
    }
MAKE_METAL_SRCARG_STRING_(x, (1 << 0))
MAKE_METAL_SRCARG_STRING_(y, (1 << 1))
MAKE_METAL_SRCARG_STRING_(z, (1 << 2))
MAKE_METAL_SRCARG_STRING_(w, (1 << 3))
MAKE_METAL_SRCARG_STRING_(scalar, (1 << 0))
MAKE_METAL_SRCARG_STRING_(full, 0xF)
MAKE_METAL_SRCARG_STRING_(masked, ctx->dest_arg.writemask)
MAKE_METAL_SRCARG_STRING_(vec3, 0x7)
MAKE_METAL_SRCARG_STRING_(vec2, 0x3)
#undef MAKE_METAL_SRCARG_STRING_

// special cases for comparison opcodes...

static const char *get_METAL_comparison_string_scalar(Context *ctx)
{
    static const char *comps[] = { "", ">", "==", ">=", "<", "!=", "<=" };
    if (ctx->instruction_controls >= STATICARRAYLEN(comps))
    {
        fail(ctx, "unknown comparison control");
        return "";
    } // if

    return comps[ctx->instruction_controls];
} // get_METAL_comparison_string_scalar

static const char *get_METAL_comparison_string_vector(Context *ctx)
{
    return get_METAL_comparison_string_scalar(ctx);  // standard C operators work for vectors in Metal.
} // get_METAL_comparison_string_vector


static void emit_METAL_start(Context *ctx, const char *profilestr)
{
    if (!shader_is_vertex(ctx) && !shader_is_pixel(ctx))
    {
        failf(ctx, "Shader type %u unsupported in this profile.",
              (uint) ctx->shader_type);
        return;
    } // if

    if (!ctx->mainfn)
    {
        if (shader_is_vertex(ctx))
            ctx->mainfn = StrDup(ctx, "VertexShader");
        else if (shader_is_pixel(ctx))
            ctx->mainfn = StrDup(ctx, "FragmentShader");
    } // if

    set_output(ctx, &ctx->mainline);
    ctx->indent++;
} // emit_METAL_start

static void emit_METAL_RET(Context *ctx);
static void emit_METAL_end(Context *ctx)
{
    // !!! FIXME: maybe handle this at a higher level?
    // ps_1_* writes color to r0 instead oC0. We move it to the right place.
    // We don't have to worry about a RET opcode messing this up, since
        //  RET isn't available before ps_2_0.
    if (shader_is_pixel(ctx) && !shader_version_atleast(ctx, 2, 0))
    {
        set_used_register(ctx, REG_TYPE_COLOROUT, 0, 1);
        output_line(ctx, "oC0 = r0;");
    } // if

    // !!! FIXME: maybe handle this at a higher level?
    // force a RET opcode if we're at the end of the stream without one.
    if (ctx->previous_opcode != OPCODE_RET)
        emit_METAL_RET(ctx);
} // emit_METAL_end

static void emit_METAL_phase(Context *ctx)
{
    // no-op in Metal.
} // emit_METAL_phase

static void emit_METAL_finalize(Context *ctx)
{
    // If we had a relative addressing of REG_TYPE_INPUT, we need to build
    //  an array for it at the start of main(). GLSL doesn't let you specify
    //  arrays of attributes.
    //float4 blah_array[BIGGEST_ARRAY];
    if (ctx->have_relative_input_registers) // !!! FIXME
        fail(ctx, "Relative addressing of input registers not supported.");

    // Insert header includes we need...
    push_output(ctx, &ctx->preflight);
    #define INC_METAL_HEADER(name) \
        if (ctx->metal_need_header_##name) { \
            output_line(ctx, "#include <metal_" #name ">"); \
        }
    INC_METAL_HEADER(common);
    INC_METAL_HEADER(math);
    INC_METAL_HEADER(relational);
    INC_METAL_HEADER(geometric);
    INC_METAL_HEADER(graphics);
    INC_METAL_HEADER(texture);
    #undef INC_METAL_HEADER
    output_blank_line(ctx);
    output_line(ctx, "using namespace metal;");
    output_blank_line(ctx);
    pop_output(ctx);

    // Fill in the shader's mainline function signature.
    push_output(ctx, &ctx->mainline_intro);
    output_line(ctx, "%s %s%s %s (",
                shader_is_vertex(ctx) ? "vertex" : "fragment",
                ctx->outputs ? ctx->mainfn : "void",
                ctx->outputs ? "Output" : "", ctx->mainfn);
    pop_output(ctx);

    push_output(ctx, &ctx->mainline_arguments);
    ctx->indent++;

    const int uniform_count = ctx->uniform_float4_count + ctx->uniform_int4_count + ctx->uniform_bool_count;
    int commas = 0;
    if (uniform_count) commas++;
    if (ctx->inputs) commas++;
    if (commas) commas--;

    if (uniform_count > 0)
    {
        push_output(ctx, &ctx->globals);
        output_line(ctx, "struct %sUniforms", ctx->mainfn);
        output_line(ctx, "{");
        ctx->indent++;
        if (ctx->uniform_float4_count > 0)
            output_line(ctx, "float4 uniforms_float4[%d];", ctx->uniform_float4_count);
        if (ctx->uniform_int4_count > 0)
            output_line(ctx, "int4 uniforms_int4[%d];", ctx->uniform_int4_count);
        if (ctx->uniform_bool_count > 0)
            output_line(ctx, "bool uniforms_bool[%d];", ctx->uniform_bool_count);
        ctx->indent--;
        output_line(ctx, "};");
        pop_output(ctx);

        output_line(ctx, "constant %sUniforms &uniforms [[buffer(16)]]%s", ctx->mainfn, commas ? "," : "");
        commas--;
    } // if

    if (ctx->inputs)
    {
        output_line(ctx, "%sInput input [[stage_in]]%s", ctx->mainfn, commas ? "," : "");
        commas--;
    } // if

    ctx->indent--;
    output_line(ctx, ") {");
    if (ctx->outputs)
    {
        ctx->indent++;
        output_line(ctx, "%sOutput output;", ctx->mainfn);

        push_output(ctx, &ctx->mainline);
        ctx->indent++;
        output_line(ctx, "return output;");
        pop_output(ctx);
    } // if
    pop_output(ctx);

    if (ctx->inputs)
    {
        push_output(ctx, &ctx->inputs);
        output_line(ctx, "};");
        output_blank_line(ctx);
        pop_output(ctx);
    } // if

    if (ctx->outputs)
    {
        push_output(ctx, &ctx->outputs);
        output_line(ctx, "};");
        output_blank_line(ctx);
        pop_output(ctx);
    } // if

    // throw some blank lines around to make source more readable.
    if (ctx->globals)  // don't add a blank line if the section is empty.
    {
        push_output(ctx, &ctx->globals);
        output_blank_line(ctx);
        pop_output(ctx);
    } // if
} // emit_METAL_finalize

static void emit_METAL_global(Context *ctx, RegisterType regtype, int regnum)
{
    char varname[64];
    get_METAL_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    // These aren't actually global in metal, set them up at top of mainline.
    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;

    switch (regtype)
    {
        case REG_TYPE_ADDRESS:
            if (shader_is_vertex(ctx))
                output_line(ctx, "int4 %s;", varname);
            else if (shader_is_pixel(ctx))  // actually REG_TYPE_TEXTURE.
            {
                // We have to map texture registers to temps for ps_1_1, since
                //  they work like temps, initialize with tex coords, and the
                //  ps_1_1 TEX opcode expects to overwrite it.
                if (!shader_version_atleast(ctx, 1, 4))
                    output_line(ctx, "float4 %s = input.%s;",varname,varname);
            } // else if
            break;
        case REG_TYPE_PREDICATE:
            output_line(ctx, "bool4 %s;", varname);
            break;
        case REG_TYPE_TEMP:
            output_line(ctx, "float4 %s;", varname);
            break;
        case REG_TYPE_LOOP:
            break; // no-op. We declare these in for loops at the moment.
        case REG_TYPE_LABEL:
            break; // no-op. If we see it here, it means we optimized it out.
        default:
            fail(ctx, "BUG: we used a register we don't know how to define.");
            break;
    } // switch

    pop_output(ctx);
} // emit_METAL_global

static void emit_METAL_array(Context *ctx, VariableList *var)
{
    // All uniforms (except constant arrays, which only get pushed once at
    //  compile time) are now packed into a single array, so we can batch
    //  the uniform transfers. So this doesn't actually define an array
    //  here; the one, big array is emitted during finalization instead.
    // However, we need to #define the offset into the one, big array here,
    //  and let dereferences use that #define.
    const int base = var->index;
    const int metalbase = ctx->uniform_float4_count;
    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;
    output_line(ctx, "const int ARRAYBASE_%d = %d;", base, metalbase);
    pop_output(ctx);
    var->emit_position = metalbase;
} // emit_METAL_array

static void emit_METAL_const_array(Context *ctx, const ConstantsList *clist,
                                   int base, int size)
{
    char varname[64];
    get_METAL_const_array_varname_in_buf(ctx,base,size,varname,sizeof(varname));

    const char *cstr = NULL;
    push_output(ctx, &ctx->globals);
    output_line(ctx, "constant float4 %s[%d] = {", varname, size);
    ctx->indent++;

    int i;
    for (i = 0; i < size; i++)
    {
        while (clist->constant.type != MOJOSHADER_UNIFORM_FLOAT)
            clist = clist->next;
        assert(clist->constant.index == (base + i));

        char val0[32];
        char val1[32];
        char val2[32];
        char val3[32];
        floatstr(ctx, val0, sizeof (val0), clist->constant.value.f[0], 1);
        floatstr(ctx, val1, sizeof (val1), clist->constant.value.f[1], 1);
        floatstr(ctx, val2, sizeof (val2), clist->constant.value.f[2], 1);
        floatstr(ctx, val3, sizeof (val3), clist->constant.value.f[3], 1);

        output_line(ctx, "float4(%s, %s, %s, %s)%s", val0, val1, val2, val3,
                        (i < (size-1)) ? "," : "");

        clist = clist->next;
    } // for

    ctx->indent--;
    output_line(ctx, "};");
    pop_output(ctx);
} // emit_METAL_const_array

static void emit_METAL_uniform(Context *ctx, RegisterType regtype, int regnum,
                              const VariableList *var)
{
    // Now that we're pushing all the uniforms as one struct, pack these
    //  down, so if we only use register c439, it'll actually map to
    //  uniforms.uniforms_float4[0]. As we push one big struct, this will
    //  prevent uploading unused data.

    const char *utype = get_METAL_uniform_type(ctx, regtype);
    char varname[64];
    char name[64];
    int index = 0;

    get_METAL_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;

    if (var == NULL)
    {
        get_METAL_uniform_array_varname(ctx, regtype, name, sizeof (name));

        if (regtype == REG_TYPE_CONST)
            index = ctx->uniform_float4_count;
        else if (regtype == REG_TYPE_CONSTINT)
            index = ctx->uniform_int4_count;
        else if (regtype == REG_TYPE_CONSTBOOL)
            index = ctx->uniform_bool_count;
        else  // get_METAL_uniform_array_varname() would have called fail().
            assert(isfail(ctx));

        // !!! FIXME: can cause unused var warnings in Clang...
        //output_line(ctx, "constant %s &%s = %s[%d];", utype, varname, name, index);
        output_line(ctx, "#define %s %s[%d]", varname, name, index);
        push_output(ctx, &ctx->mainline);
        ctx->indent++;
        output_line(ctx, "#undef %s", varname);  // !!! FIXME: gross.
        pop_output(ctx);
    } // if

    else
    {
        const int arraybase = var->index;
        if (var->constant)
        {
            get_METAL_const_array_varname_in_buf(ctx, arraybase, var->count,
                                                name, sizeof (name));
            index = (regnum - arraybase);
        } // if
        else
        {
            assert(var->emit_position != -1);
            get_METAL_uniform_array_varname(ctx, regtype, name, sizeof (name));
            index = (regnum - arraybase) + var->emit_position;
        } // else

        // !!! FIXME: might trigger unused var warnings in Clang.
        //output_line(ctx, "constant %s &%s = %s[%d];", utype, varname, name, index);
        output_line(ctx, "#define %s %s[%d];", varname, name, index);
        push_output(ctx, &ctx->mainline);
        ctx->indent++;
        output_line(ctx, "#undef %s", varname);  // !!! FIXME: gross.
        pop_output(ctx);
    } // else

    pop_output(ctx);
} // emit_METAL_uniform

static void emit_METAL_sampler(Context *ctx,int stage,TextureType ttype,int tb)
{
    char var[64];
    const char *texsuffix = NULL;
    switch (ttype)
    {
        case TEXTURE_TYPE_2D: texsuffix = "2d"; break;
        case TEXTURE_TYPE_CUBE: texsuffix = "cube"; break;
        case TEXTURE_TYPE_VOLUME: texsuffix = "3d"; break;
        default: assert(!"unexpected texture type"); return;
    } // switch

    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, stage, var, sizeof (var));

    push_output(ctx, &ctx->mainline_arguments);
    ctx->indent++;
    output_line(ctx, "texture%s<float> %s_texture [[texture(%d)]],",
                texsuffix, var, stage);
    output_line(ctx, "sampler %s [[sampler(%d)]],", var, stage);
    pop_output(ctx);

    if (tb)  // This sampler used a ps_1_1 TEXBEM opcode?
    {
        push_output(ctx, &ctx->mainline_top);
        ctx->indent++;
        char name[64];
        const int index = ctx->uniform_float4_count;
        ctx->uniform_float4_count += 2;
        get_METAL_uniform_array_varname(ctx, REG_TYPE_CONST, name, sizeof (name));
        output_line(ctx, "constant float4 &%s_texbem = %s[%d];", var, name, index);
        output_line(ctx, "constant float4 &%s_texbeml = %s[%d];", var, name, index+1);
        pop_output(ctx);
    } // if
} // emit_METAL_sampler

static void emit_METAL_attribute(Context *ctx, RegisterType regtype, int regnum,
                                MOJOSHADER_usage usage, int index, int wmask,
                                int flags)
{
    // !!! FIXME: this function doesn't deal with write masks at all yet!
    const char *usage_str = NULL;
    char index_str[16] = { '\0' };
    char var[64];

    get_METAL_varname_in_buf(ctx, regtype, regnum, var, sizeof (var));

    //assert((flags & MOD_PP) == 0);  // !!! FIXME: is PP allowed?

    if (index != 0)  // !!! FIXME: a lot of these MUST be zero.
        snprintf(index_str, sizeof (index_str), "%u", (uint) index);

    if (shader_is_vertex(ctx))
    {
        // pre-vs3 output registers.
        // these don't ever happen in DCL opcodes, I think. Map to vs_3_*
        //  output registers.
        if (!shader_version_atleast(ctx, 3, 0))
        {
            if (regtype == REG_TYPE_RASTOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                index = regnum;
                switch ((const RastOutType) regnum)
                {
                    case RASTOUT_TYPE_POSITION:
                        usage = MOJOSHADER_USAGE_POSITION;
                        break;
                    case RASTOUT_TYPE_FOG:
                        usage = MOJOSHADER_USAGE_FOG;
                        break;
                    case RASTOUT_TYPE_POINT_SIZE:
                        usage = MOJOSHADER_USAGE_POINTSIZE;
                        break;
                } // switch
            } // if

            else if (regtype == REG_TYPE_ATTROUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_COLOR;
                index = regnum;
            } // else if

            else if (regtype == REG_TYPE_TEXCRDOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_TEXCOORD;
                index = regnum;
            } // else if
        } // if

        if (regtype == REG_TYPE_INPUT)
        {
            push_output(ctx, &ctx->inputs);
            if (buffer_size(ctx->inputs) == 0)
            {
                output_line(ctx, "struct %sInput", ctx->mainfn);
                output_line(ctx, "{");
            } // if

            ctx->indent++;
            output_line(ctx, "float4 %s [[attribute(%d)]];", var, regnum);
            pop_output(ctx);

            push_output(ctx, &ctx->mainline_top);
            ctx->indent++;
            // !!! FIXME: might trigger unused var warnings in Clang.
            //output_line(ctx, "constant float4 &%s = input.%s;", var, var);
            output_line(ctx, "#define %s input.%s", var, var);
            pop_output(ctx);
            push_output(ctx, &ctx->mainline);
            ctx->indent++;
            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
            pop_output(ctx);
        } // if

        else if (regtype == REG_TYPE_OUTPUT)
        {
            push_output(ctx, &ctx->outputs);
            if (buffer_size(ctx->outputs) == 0)
            {
                output_line(ctx, "struct %sOutput", ctx->mainfn);
                output_line(ctx, "{");
            } // if

            ctx->indent++;

            switch (usage)
            {
                case MOJOSHADER_USAGE_POSITION:
                    output_line(ctx, "float4 %s [[position]];", var);
                    break;
                case MOJOSHADER_USAGE_POINTSIZE:
                    output_line(ctx, "float4 %s [[point_size]];", var);
                    break;
                case MOJOSHADER_USAGE_COLOR:
                    output_line(ctx, "float4 %s [[user(color%d)]];", var, index);
                    break;
                case MOJOSHADER_USAGE_FOG:
                    output_line(ctx, "float4 %s [[user(fog)]];", var);
                    break;
                case MOJOSHADER_USAGE_TEXCOORD:
                    output_line(ctx, "float4 %s [[user(texcoord%d)]];", var, index);
                    break;
                default:
                    // !!! FIXME: we need to deal with some more built-in varyings here.
                    break;
            } // switch

            pop_output(ctx);

            push_output(ctx, &ctx->mainline_top);
            ctx->indent++;
            // !!! FIXME: this doesn't work.
            //output_line(ctx, "float4 &%s = output.%s;", var, var);
            output_line(ctx, "#define %s output.%s", var, var);
            pop_output(ctx);
            push_output(ctx, &ctx->mainline);
            ctx->indent++;
            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
            pop_output(ctx);
        } // else if

        else
        {
            fail(ctx, "unknown vertex shader attribute register");
        } // else
    } // if

    else if (shader_is_pixel(ctx))
    {
        // samplers DCLs get handled in emit_METAL_sampler().

        if (flags & MOD_CENTROID)  // !!! FIXME
        {
            failf(ctx, "centroid unsupported in %s profile", ctx->profile->name);
            return;
        } // if

        if ((regtype == REG_TYPE_COLOROUT) || (regtype == REG_TYPE_DEPTHOUT))
        {
            push_output(ctx, &ctx->outputs);
            if (buffer_size(ctx->outputs) == 0)
            {
                output_line(ctx, "struct %sOutput", ctx->mainfn);
                output_line(ctx, "{");
            } // if
            ctx->indent++;

            if (regtype == REG_TYPE_COLOROUT)
                output_line(ctx, "float4 %s [[color(%d)]];", var, regnum);
            else if (regtype == REG_TYPE_DEPTHOUT)
                output_line(ctx, "float %s [[depth(any)]];", var);

            pop_output(ctx);

            push_output(ctx, &ctx->mainline_top);
            ctx->indent++;
            // !!! FIXME: this doesn't work.
            //output_line(ctx, "float%s &%s = output.%s;", (regtype == REG_TYPE_DEPTHOUT) ? "" : "4", var, var);
            output_line(ctx, "#define %s output.%s", var, var);
            pop_output(ctx);
            push_output(ctx, &ctx->mainline);
            ctx->indent++;
            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
            pop_output(ctx);
        } // if

        // !!! FIXME: can you actualy have a texture register with COLOR usage?
        else if ((regtype == REG_TYPE_TEXTURE) ||
                 (regtype == REG_TYPE_INPUT) ||
                 (regtype == REG_TYPE_MISCTYPE))
        {
            int skipreference = 0;
            push_output(ctx, &ctx->inputs);
            if (buffer_size(ctx->inputs) == 0)
            {
                output_line(ctx, "struct %sInput", ctx->mainfn);
                output_line(ctx, "{");
            } // if
            ctx->indent++;

            if (regtype == REG_TYPE_MISCTYPE)
            {
                const MiscTypeType mt = (MiscTypeType) regnum;
                if (mt == MISCTYPE_TYPE_FACE)
                    output_line(ctx, "bool %s [[front_facing]];", var);
                else if (mt == MISCTYPE_TYPE_POSITION)
                    output_line(ctx, "float4 %s [[position]];", var);
                else
                    fail(ctx, "BUG: unhandled misc register");
            } // else if

            else
            {
                if (usage == MOJOSHADER_USAGE_TEXCOORD)
                {
                    // ps_1_1 does a different hack for this attribute.
                    //  Refer to emit_METAL_global()'s REG_TYPE_ADDRESS code.
                    if (!shader_version_atleast(ctx, 1, 4))
                        skipreference = 1;
                    output_line(ctx, "float4 %s [[user(texcoord%d)]];", var, index);
                } // if

                else if (usage == MOJOSHADER_USAGE_COLOR)
                    output_line(ctx, "float4 %s [[user(color%d)]];", var, index);

                else if (usage == MOJOSHADER_USAGE_FOG)
                    output_line(ctx, "float4 %s [[user(fog)]];", var);

                #if 0  // !!! FIXME: Ethan added this for GLSL, but I don't know what it does.  --ryan.
                else if (usage == MOJOSHADER_USAGE_DEPTH) // !!! FIXME: Possibly more! -flibit
                {
                    push_output(ctx, &ctx->globals);
                    output_line(ctx, "attribute vec4 %s;", var);
                    pop_output(ctx);
                } // else if
                #endif
            } // else

            pop_output(ctx);

            // !!! FIXME: can cause unused var warnings in Clang...
            #if 0
            push_output(ctx, &ctx->mainline_top);
            ctx->indent++;
            if ((regtype == REG_TYPE_MISCTYPE)&&(regnum == MISCTYPE_TYPE_FACE))
                output_line(ctx, "constant bool &%s = input.%s;", var, var);
            else if (!skipreference)
                output_line(ctx, "constant float4 &%s = input.%s;", var, var);
            pop_output(ctx);
            #endif

            if (!skipreference)
            {
                push_output(ctx, &ctx->mainline_top);
                ctx->indent++;
                output_line(ctx, "#define %s input.%s", var, var);
                pop_output(ctx);
                push_output(ctx, &ctx->mainline);
                ctx->indent++;
                output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
                pop_output(ctx);
            } // if
        } // else if

        else
        {
            fail(ctx, "unknown pixel shader attribute register");
        } // else
    } // else if

    else
    {
        fail(ctx, "Unknown shader type");  // state machine should catch this.
    } // else
} // emit_METAL_attribute

static void emit_METAL_NOP(Context *ctx)
{
    // no-op is a no-op.  :)
} // emit_METAL_NOP

static void emit_METAL_MOV(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "%s", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_MOV

static void emit_METAL_ADD(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "%s + %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_ADD

static void emit_METAL_SUB(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "%s - %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_SUB

static void emit_METAL_MAD(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_METAL_srcarg_string_masked(ctx, 2, src2, sizeof (src2));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "(%s * %s) + %s", src0, src1, src2);
    output_line(ctx, "%s", code);
} // emit_METAL_MAD

static void emit_METAL_MUL(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "%s * %s", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_MUL

static void emit_METAL_RCP(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code), "1.0 / %s", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_RCP

static void emit_METAL_RSQ(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "rsqrt(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_RSQ

static void emit_METAL_dotprod(Context *ctx, const char *src0, const char *src1,
                              const char *extra)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char castleft[16] = { '\0' };
    const char *castright = "";
    if (vecsize != 1)
    {
        snprintf(castleft, sizeof (castleft), "float%d(", vecsize);
        castright = ")";
    } // if

    char code[128];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "%sdot(%s, %s)%s%s",
                             castleft, src0, src1, extra, castright);
    output_line(ctx, "%s", code);
} // emit_METAL_dotprod

static void emit_METAL_DP3(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_vec3(ctx, 1, src1, sizeof (src1));
    emit_METAL_dotprod(ctx, src0, src1, "");
} // emit_METAL_DP3

static void emit_METAL_DP4(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_full(ctx, 1, src1, sizeof (src1));
    emit_METAL_dotprod(ctx, src0, src1, "");
} // emit_METAL_DP4

static void emit_METAL_MIN(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "min(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_MIN

static void emit_METAL_MAX(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "max(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_MAX

static void emit_METAL_SLT(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // float(bool) or vec(bvec) results in 0.0 or 1.0, like SLT wants.
    if (vecsize == 1)
        make_METAL_destarg_assign(ctx, code, sizeof (code), "float(%s < %s)", src0, src1);
    else
    {
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                  "float%d(%s < %s)", vecsize, src0, src1);
    } // else
    output_line(ctx, "%s", code);
} // emit_METAL_SLT

static void emit_METAL_SGE(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // float(bool) or vec(bvec) results in 0.0 or 1.0, like SGE wants.
    if (vecsize == 1)
    {
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                 "float(%s >= %s)", src0, src1);
    } // if
    else
    {
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                  "float%d(%s >= %s)", vecsize, src0, src1);
    } // else
    output_line(ctx, "%s", code);
} // emit_METAL_SGE

static void emit_METAL_EXP(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "exp2(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_EXP

static void emit_METAL_LOG(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "log2(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_LOG

static void emit_METAL_LIT_helper(Context *ctx)
{
    const char *maxp = "127.9961"; // value from the dx9 reference.

    if (ctx->glsl_generated_lit_helper)
        return;

    ctx->glsl_generated_lit_helper = 1;
    ctx->metal_need_header_common = 1;
    ctx->metal_need_header_math = 1;

    push_output(ctx, &ctx->helpers);
    output_line(ctx, "static float4 LIT(const float4 src)");
    output_line(ctx, "{"); ctx->indent++;
    output_line(ctx,   "const float power = clamp(src.w, -%s, %s);",maxp,maxp);
    output_line(ctx,   "float4 retval = float4(1.0, 0.0, 0.0, 1.0);");
    output_line(ctx,   "if (src.x > 0.0) {"); ctx->indent++;
    output_line(ctx,     "retval.y = src.x;");
    output_line(ctx,     "if (src.y > 0.0) {"); ctx->indent++;
    output_line(ctx,       "retval.z = pow(src.y, power);"); ctx->indent--;
    output_line(ctx,     "}"); ctx->indent--;
    output_line(ctx,   "}");
    output_line(ctx,   "return retval;"); ctx->indent--;
    output_line(ctx, "}");
    output_blank_line(ctx);
    pop_output(ctx);
} // emit_METAL_LIT_helper

static void emit_METAL_LIT(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char code[128];
    emit_METAL_LIT_helper(ctx);
    make_METAL_destarg_assign(ctx, code, sizeof (code), "LIT(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_LIT

static void emit_METAL_DST(Context *ctx)
{
    // !!! FIXME: needs to take ctx->dst_arg.writemask into account.
    char src0_y[64]; make_METAL_srcarg_string_y(ctx, 0, src0_y, sizeof (src0_y));
    char src1_y[64]; make_METAL_srcarg_string_y(ctx, 1, src1_y, sizeof (src1_y));
    char src0_z[64]; make_METAL_srcarg_string_z(ctx, 0, src0_z, sizeof (src0_z));
    char src1_w[64]; make_METAL_srcarg_string_w(ctx, 1, src1_w, sizeof (src1_w));

    char code[128];
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                             "float4(1.0, %s * %s, %s, %s)",
                             src0_y, src1_y, src0_z, src1_w);
    output_line(ctx, "%s", code);
} // emit_METAL_DST

static void emit_METAL_LRP(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_METAL_srcarg_string_masked(ctx, 2, src2, sizeof (src2));
    char code[128];
    ctx->metal_need_header_common = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "mix(%s, %s, %s)",
                             src2, src1, src0);
    output_line(ctx, "%s", code);
} // emit_METAL_LRP

static void emit_METAL_FRC(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "fract(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_FRC

static void emit_METAL_M4X4(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_METAL_srcarg_string_full(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_METAL_srcarg_string_full(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_METAL_srcarg_string_full(ctx, 3, row2, sizeof (row2));
    char row3[64]; make_METAL_srcarg_string_full(ctx, 4, row3, sizeof (row3));
    char code[256];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                    "float4(dot(%s, %s), dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                    src0, row0, src0, row1, src0, row2, src0, row3);
    output_line(ctx, "%s", code);
} // emit_METAL_M4X4

static void emit_METAL_M4X3(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_full(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_METAL_srcarg_string_full(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_METAL_srcarg_string_full(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_METAL_srcarg_string_full(ctx, 3, row2, sizeof (row2));
    char code[256];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                                "float3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1, src0, row2);
    output_line(ctx, "%s", code);
} // emit_METAL_M4X3

static void emit_METAL_M3X4(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_METAL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_METAL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_METAL_srcarg_string_vec3(ctx, 3, row2, sizeof (row2));
    char row3[64]; make_METAL_srcarg_string_vec3(ctx, 4, row3, sizeof (row3));
    char code[256];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                                "float4(dot(%s, %s), dot(%s, %s), "
                                     "dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1,
                                src0, row2, src0, row3);
    output_line(ctx, "%s", code);
} // emit_METAL_M3X4

static void emit_METAL_M3X3(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_METAL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_METAL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));
    char row2[64]; make_METAL_srcarg_string_vec3(ctx, 3, row2, sizeof (row2));
    char code[256];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                                "float3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1, src0, row2);
    output_line(ctx, "%s", code);
} // emit_METAL_M3X3

static void emit_METAL_M3X2(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char row0[64]; make_METAL_srcarg_string_vec3(ctx, 1, row0, sizeof (row0));
    char row1[64]; make_METAL_srcarg_string_vec3(ctx, 2, row1, sizeof (row1));
    char code[256];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                                "float2(dot(%s, %s), dot(%s, %s))",
                                src0, row0, src0, row1);
    output_line(ctx, "%s", code);
} // emit_METAL_M3X2

static void emit_METAL_CALL(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    if (ctx->loops > 0)
        output_line(ctx, "%s(aL);", src0);
    else
        output_line(ctx, "%s();", src0);
} // emit_METAL_CALL

static void emit_METAL_CALLNZ(Context *ctx)
{
    // !!! FIXME: if src1 is a constbool that's true, we can remove the
    // !!! FIXME:  if. If it's false, we can make this a no-op.
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));

    if (ctx->loops > 0)
        output_line(ctx, "if (%s) { %s(aL); }", src1, src0);
    else
        output_line(ctx, "if (%s) { %s(); }", src1, src0);
} // emit_METAL_CALLNZ

static void emit_METAL_LOOP(Context *ctx)
{
    // !!! FIXME: swizzle?
    char var[64]; get_METAL_srcarg_varname(ctx, 1, var, sizeof (var));
    assert(ctx->source_args[0].regnum == 0);  // in case they add aL1 someday.
    output_line(ctx, "{");
    ctx->indent++;
    output_line(ctx, "const int aLend = %s.x + %s.y;", var, var);
    output_line(ctx, "for (int aL = %s.y; aL < aLend; aL += %s.z) {", var, var);
    ctx->indent++;
} // emit_METAL_LOOP

static void emit_METAL_RET(Context *ctx)
{
    // thankfully, the MSDN specs say a RET _has_ to end a function...no
    //  early returns. So if you hit one, you know you can safely close
    //  a high-level function.
    push_output(ctx, &ctx->postflight);
    output_line(ctx, "}");
    output_blank_line(ctx);
    set_output(ctx, &ctx->subroutines);  // !!! FIXME: is this for LABEL? Maybe set it there so we don't allocate unnecessarily.
} // emit_METAL_RET

static void emit_METAL_ENDLOOP(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
    ctx->indent--;
    output_line(ctx, "}");
} // emit_METAL_ENDLOOP

static void emit_METAL_LABEL(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    const int label = ctx->source_args[0].regnum;
    RegisterList *reg = reglist_find(&ctx->used_registers, REG_TYPE_LABEL, label);
    assert(ctx->output == ctx->subroutines);  // not mainline, etc.
    assert(ctx->indent == 0);  // we shouldn't be in the middle of a function.

    // MSDN specs say CALL* has to come before the LABEL, so we know if we
    //  can ditch the entire function here as unused.
    if (reg == NULL)
        set_output(ctx, &ctx->ignore);  // Func not used. Parse, but don't output.

    // !!! FIXME: it would be nice if we could determine if a function is
    // !!! FIXME:  only called once and, if so, forcibly inline it.

    // !!! FIXME: this worked in GLSL because all our state is global to the shader,
    // !!! FIXME:  but in metal we kept it local to the shader mainline.
    // !!! FIXME:  Can we do C++11 lambdas in Metal to have nested functions?  :)

    const char *uses_loopreg = ((reg) && (reg->misc == 1)) ? "int aL" : "";
    output_line(ctx, "static void %s(%s)", src0, uses_loopreg);
    output_line(ctx, "{");
    ctx->indent++;
} // emit_METAL_LABEL

static void emit_METAL_DCL(Context *ctx)
{
    // no-op. We do this in our emit_attribute() and emit_uniform().
} // emit_METAL_DCL

static void emit_METAL_POW(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                             "pow(abs(%s), %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_POW

static void emit_METAL_CRS(Context *ctx)
{
    // !!! FIXME: needs to take ctx->dst_arg.writemask into account.
    char src0[64]; make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_vec3(ctx, 1, src1, sizeof (src1));
    char code[128];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code),
                             "cross(%s, %s)", src0, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_CRS

static void emit_METAL_SGN(Context *ctx)
{
    // (we don't need the temporary registers specified for the D3D opcode.)
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_common = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "sign(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_SGN

static void emit_METAL_ABS(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_math = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "abs(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_ABS

static void emit_METAL_NRM(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_geometric = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "normalize(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_NRM

static void emit_METAL_SINCOS(Context *ctx)
{
    // we don't care about the temp registers that <= sm2 demands; ignore them.
    //  sm2 also talks about what components are left untouched vs. undefined,
    //  but we just leave those all untouched with Metal write masks (which
    //  would fulfill the "undefined" requirement, too).
    const int mask = ctx->dest_arg.writemask;
    char src0[64]; make_METAL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char code[128] = { '\0' };

    ctx->metal_need_header_math = 1;
    if (writemask_x(mask))
        make_METAL_destarg_assign(ctx, code, sizeof (code), "cos(%s)", src0);
    else if (writemask_y(mask))
        make_METAL_destarg_assign(ctx, code, sizeof (code), "sin(%s)", src0);
    else if (writemask_xy(mask))
    {
        // !!! FIXME: can use sincos(), but need to assign cos to a temp, since it needs a reference.
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                 "float2(cos(%s), sin(%s))", src0, src0);
    } // else if

    output_line(ctx, "%s", code);
} // emit_METAL_SINCOS

static void emit_METAL_REP(Context *ctx)
{
    // !!! FIXME:
    // msdn docs say legal loop values are 0 to 255. We can check DEFI values
    //  at parse time, but if they are pulling a value from a uniform, do
    //  we clamp here?
    // !!! FIXME: swizzle is legal here, right?
    char src0[64]; make_METAL_srcarg_string_x(ctx, 0, src0, sizeof (src0));
    const uint rep = (uint) ctx->reps;
    output_line(ctx, "for (int rep%u = 0; rep%u < %s; rep%u++) {",
                rep, rep, src0, rep);
    ctx->indent++;
} // emit_METAL_REP

static void emit_METAL_ENDREP(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
} // emit_METAL_ENDREP

static void emit_METAL_IF(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    output_line(ctx, "if (%s) {", src0);
    ctx->indent++;
} // emit_METAL_IF

static void emit_METAL_IFC(Context *ctx)
{
    const char *comp = get_METAL_comparison_string_scalar(ctx);
    char src0[64]; make_METAL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_scalar(ctx, 1, src1, sizeof (src1));
    output_line(ctx, "if (%s %s %s) {", src0, comp, src1);
    ctx->indent++;
} // emit_METAL_IFC

static void emit_METAL_ELSE(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "} else {");
    ctx->indent++;
} // emit_METAL_ELSE

static void emit_METAL_ENDIF(Context *ctx)
{
    ctx->indent--;
    output_line(ctx, "}");
} // emit_METAL_ENDIF

static void emit_METAL_BREAK(Context *ctx)
{
    output_line(ctx, "break;");
} // emit_METAL_BREAK

static void emit_METAL_BREAKC(Context *ctx)
{
    const char *comp = get_METAL_comparison_string_scalar(ctx);
    char src0[64]; make_METAL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_scalar(ctx, 1, src1, sizeof (src1));
    output_line(ctx, "if (%s %s %s) { break; }", src0, comp, src1);
} // emit_METAL_BREAKC

static void emit_METAL_MOVA(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];

    ctx->metal_need_header_math = 1;
    ctx->metal_need_header_common = 1;

    if (vecsize == 1)
    {
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                 "int(floor(abs(%s) + 0.5) * sign(%s))",
                                 src0, src0);
    } // if

    else
    {
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                            "int%d(floor(abs(%s) + float%d(0.5)) * sign(%s))",
                            vecsize, src0, vecsize, src0);
    } // else

    output_line(ctx, "%s", code);
} // emit_METAL_MOVA

static void emit_METAL_DEFB(Context *ctx)
{
    char varname[64]; get_METAL_destarg_varname(ctx, varname, sizeof (varname));
    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;
    output_line(ctx, "const bool %s = %s;",
                varname, ctx->dwords[0] ? "true" : "false");
    pop_output(ctx);
} // emit_METAL_DEFB

static void emit_METAL_DEFI(Context *ctx)
{
    char varname[64]; get_METAL_destarg_varname(ctx, varname, sizeof (varname));
    const int32 *x = (const int32 *) ctx->dwords;
    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;
    output_line(ctx, "const int4 %s = int4(%d, %d, %d, %d);",
                varname, (int) x[0], (int) x[1], (int) x[2], (int) x[3]);
    pop_output(ctx);
} // emit_METAL_DEFI

EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXCRD)

static void emit_METAL_TEXKILL(Context *ctx)
{
    char dst[64]; get_METAL_destarg_varname(ctx, dst, sizeof (dst));
    ctx->metal_need_header_relational = 1;
    ctx->metal_need_header_graphics = 1;
    output_line(ctx, "if (any(%s.xyz < float3(0.0))) discard_fragment();", dst);
} // emit_METAL_TEXKILL

static void metal_texld(Context *ctx, const int texldd)
{
    ctx->metal_need_header_texture = 1;
    if (!shader_version_atleast(ctx, 1, 4))
    {
        DestArgInfo *info = &ctx->dest_arg;
        char dst[64];
        char sampler[64];
        char code[128] = {0};

        assert(!texldd);

        RegisterList *sreg;
        sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, info->regnum);
        const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);

        char swizzle[4] = { 'x', 'y', 'z', '\0' };
        if (ttype == TEXTURE_TYPE_2D)
            swizzle[2] = '\0';  // "xy" instead of "xyz".

        // !!! FIXME: this code counts on the register not having swizzles, etc.
        get_METAL_destarg_varname(ctx, dst, sizeof (dst));
        get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                                 sampler, sizeof (sampler));

        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                  "%s_texture.sample(%s, %s.%s)",
                                  sampler, sampler, dst, swizzle);
        output_line(ctx, "%s", code);
    } // if

    else if (!shader_version_atleast(ctx, 2, 0))
    {
        // ps_1_4 is different, too!
        fail(ctx, "TEXLD == Shader Model 1.4 unimplemented.");  // !!! FIXME
        return;
    } // else if

    else
    {
        const SourceArgInfo *samp_arg = &ctx->source_args[1];
        RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                          samp_arg->regnum);
        const char *funcname = NULL;
        char src0[64] = { '\0' };
        char src1[64]; get_METAL_srcarg_varname(ctx, 1, src1, sizeof (src1)); // !!! FIXME: SRC_MOD?
        char src2[64] = { '\0' };
        char src3[64] = { '\0' };

        if (sreg == NULL)
        {
            fail(ctx, "TEXLD using undeclared sampler");
            return;
        } // if

        const char *grad = "";
        if (texldd)
        {
            switch ((const TextureType) sreg->index)
            {
                case TEXTURE_TYPE_2D:
                    grad = "2d";
                    make_METAL_srcarg_string_vec2(ctx, 2, src2, sizeof (src2));
                    make_METAL_srcarg_string_vec2(ctx, 3, src3, sizeof (src3));
                    break;
                case TEXTURE_TYPE_VOLUME:
                    grad = "3d";
                    make_METAL_srcarg_string_vec3(ctx, 2, src2, sizeof (src2));
                    make_METAL_srcarg_string_vec3(ctx, 3, src3, sizeof (src3));
                    break;
                case TEXTURE_TYPE_CUBE:
                    grad = "cube";
                    make_METAL_srcarg_string_vec3(ctx, 2, src2, sizeof (src2));
                    make_METAL_srcarg_string_vec3(ctx, 3, src3, sizeof (src3));
                    break;
            } // switch
        } // if

        // !!! FIXME: can TEXLDD set instruction_controls?
        // !!! FIXME: does the d3d bias value map directly to Metal?
        const char *biasleft = "";
        const char *biasright = "";
        char bias[64] = { '\0' };
        if (ctx->instruction_controls == CONTROL_TEXLDB)
        {
            biasleft = ", bias(";
            make_METAL_srcarg_string_w(ctx, 0, bias, sizeof (bias));
            biasright = ")";
        } // if

        // Metal doesn't have a texture2DProj() function, but you just divide
        // your texcoords by texcoords.w to achieve it anyhow, so DIY.
        const char *projop = "";
        char proj[64] = { '\0' };
        if (ctx->instruction_controls == CONTROL_TEXLDP)
        {
            if (sreg->index == TEXTURE_TYPE_CUBE)
                fail(ctx, "TEXLDP on a cubemap");  // !!! FIXME: is this legal?
            projop = " / ";
            make_METAL_srcarg_string_w(ctx, 0, proj, sizeof (proj));
        } // if

        switch ((const TextureType) sreg->index)
        {
            case TEXTURE_TYPE_2D:
                make_METAL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
                break;

            case TEXTURE_TYPE_CUBE:
            case TEXTURE_TYPE_VOLUME:
                make_METAL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
                break;

            default:
                fail(ctx, "unknown texture type");
                return;
        } // switch

        assert(!isscalar(ctx, ctx->shader_type, samp_arg->regtype, samp_arg->regnum));
        char swiz_str[6] = { '\0' };
        make_METAL_swizzle_string(swiz_str, sizeof (swiz_str),
                                 samp_arg->swizzle, ctx->dest_arg.writemask);

        char code[128];
        if (texldd)
        {
            make_METAL_destarg_assign(ctx, code, sizeof (code),
                                     "%s_texture.sample(%s, %s, gradient%s(%s, %s))%s",
                                     src1, src1, src0, grad, src2, src3, swiz_str);
        } // if
        else
        {
            make_METAL_destarg_assign(ctx, code, sizeof (code),
                                     "%s_texture.sample(%s, %s%s%s%s%s%s)%s",
                                     src1, src1, src0, projop, proj,
                                     biasleft, bias, biasright, swiz_str);
        } // else

        output_line(ctx, "%s", code);
    } // else
} // metal_texld

static void emit_METAL_TEXLD(Context *ctx)
{
    metal_texld(ctx, 0);
} // emit_METAL_TEXLD
    

static void emit_METAL_TEXBEM(Context *ctx)
{
    DestArgInfo *info = &ctx->dest_arg;
    char dst[64]; get_METAL_destarg_varname(ctx, dst, sizeof (dst));
    char src[64]; get_METAL_srcarg_varname(ctx, 0, src, sizeof (src));
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "%s_texture.sample(%s, float2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))",
        sampler, sampler,
        dst, sampler, src, sampler, src,
        dst, sampler, src, sampler, src);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXBEM


static void emit_METAL_TEXBEML(Context *ctx)
{
    // !!! FIXME: this code counts on the register not having swizzles, etc.
    DestArgInfo *info = &ctx->dest_arg;
    char dst[64]; get_METAL_destarg_varname(ctx, dst, sizeof (dst));
    char src[64]; get_METAL_srcarg_varname(ctx, 0, src, sizeof (src));
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;

    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "(%s_texture.sample(%s, float2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))) *"
        " ((%s.z * %s_texbeml.x) + %s_texbem.y)",
        sampler, sampler,
        dst, sampler, src, sampler, src,
        dst, sampler, src, sampler, src,
        src, sampler, sampler);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXBEML

EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2AR) // !!! FIXME
EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2GB) // !!! FIXME


static void emit_METAL_TEXM3X2PAD(Context *ctx)
{
    // no-op ... work happens in emit_METAL_TEXM3X2TEX().
} // emit_METAL_TEXM3X2PAD

static void emit_METAL_TEXM3X2TEX(Context *ctx)
{
    if (ctx->texm3x2pad_src0 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;
    ctx->metal_need_header_geometric = 1;

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_src0,
                            src0, sizeof (src0));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_dst0,
                            src1, sizeof (src1));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src2, sizeof (src2));
    get_METAL_destarg_varname(ctx, dst, sizeof (dst));

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "%s_texture.sample(%s, float2(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz)))",
        sampler, sampler, src0, src1, src2, dst);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXM3X2TEX

static void emit_METAL_TEXM3X3PAD(Context *ctx)
{
    // no-op ... work happens in emit_METAL_TEXM3X3*().
} // emit_METAL_TEXM3X3PAD

static void emit_METAL_TEXM3X3TEX(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;
    ctx->metal_need_header_geometric = 1;

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_METAL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s,"
            " float3(dot(%s.xyz, %s.xyz),"
            " dot(%s.xyz, %s.xyz),"
            " dot(%s.xyz, %s.xyz)))",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXM3X3TEX

static void emit_METAL_TEXM3X3SPEC_helper(Context *ctx)
{
    if (ctx->glsl_generated_texm3x3spec_helper)
        return;

    ctx->glsl_generated_texm3x3spec_helper = 1;

    push_output(ctx, &ctx->helpers);
    output_line(ctx, "float3 TEXM3X3SPEC_reflection(const float3 normal, const float3 eyeray)");
    output_line(ctx, "{"); ctx->indent++;
    output_line(ctx,   "return (2.0 * ((normal * eyeray) / (normal * normal)) * normal) - eyeray;"); ctx->indent--;
    output_line(ctx, "}");
    output_blank_line(ctx);
    pop_output(ctx);
} // emit_METAL_TEXM3X3SPEC_helper

static void emit_METAL_TEXM3X3SPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char src5[64];
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;
    ctx->metal_need_header_geometric = 1;

    emit_METAL_TEXM3X3SPEC_helper(ctx);

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[1].regnum,
                            src5, sizeof (src5));
    get_METAL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s, "
            "TEXM3X3SPEC_reflection("
                "float3("
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz)"
                "),"
                "%s.xyz,"
            ")"
        ")",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4, src5);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXM3X3SPEC

static void emit_METAL_TEXM3X3VSPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    DestArgInfo *info = &ctx->dest_arg;
    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char sampler[64];
    char code[512];

    ctx->metal_need_header_texture = 1;
    ctx->metal_need_header_geometric = 1;

    emit_METAL_TEXM3X3SPEC_helper(ctx);

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
                            sampler, sizeof (sampler));

    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_METAL_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      info->regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "Cube" : "3D";

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "texture%s(%s, "
            "TEXM3X3SPEC_reflection("
                "float3("
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz), "
                    "dot(%s.xyz, %s.xyz)"
                "), "
                "float3(%s.w, %s.w, %s.w)"
            ")"
        ")",
        ttypestr, sampler, src0, src1, src2, src3, dst, src4, src0, src2, dst);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXM3X3VSPEC

static void emit_METAL_EXPP(Context *ctx)
{
    // !!! FIXME: msdn's asm docs don't list this opcode, I'll have to check the driver documentation.
    emit_METAL_EXP(ctx);  // I guess this is just partial precision EXP?
} // emit_METAL_EXPP

static void emit_METAL_LOGP(Context *ctx)
{
    // LOGP is just low-precision LOG, but we'll take the higher precision.
    emit_METAL_LOG(ctx);
} // emit_METAL_LOGP

// common code between CMP and CND.
static void emit_METAL_comparison_operations(Context *ctx, const char *cmp)
{
    int i, j;
    DestArgInfo *dst = &ctx->dest_arg;
    const SourceArgInfo *srcarg0 = &ctx->source_args[0];
    const int origmask = dst->writemask;
    int used_swiz[4] = { 0, 0, 0, 0 };
    const int writemask[4] = { dst->writemask0, dst->writemask1,
                               dst->writemask2, dst->writemask3 };
    const int src0swiz[4] = { srcarg0->swizzle_x, srcarg0->swizzle_y,
                              srcarg0->swizzle_z, srcarg0->swizzle_w };

    for (i = 0; i < 4; i++)
    {
        int mask = (1 << i);

        if (!writemask[i]) continue;
        if (used_swiz[i]) continue;

        // This is a swizzle we haven't checked yet.
        used_swiz[i] = 1;

        // see if there are any other elements swizzled to match (.yyyy)
        for (j = i + 1; j < 4; j++)
        {
            if (!writemask[j]) continue;
            if (src0swiz[i] != src0swiz[j]) continue;
            mask |= (1 << j);
            used_swiz[j] = 1;
        } // for

        // okay, (mask) should be the writemask of swizzles we like.

        //return make_METAL_srcarg_string(ctx, idx, (1 << 0));

        char src0[64];
        char src1[64];
        char src2[64];
        make_METAL_srcarg_string(ctx, 0, (1 << i), src0, sizeof (src0));
        make_METAL_srcarg_string(ctx, 1, mask, src1, sizeof (src1));
        make_METAL_srcarg_string(ctx, 2, mask, src2, sizeof (src2));

        set_dstarg_writemask(dst, mask);

        char code[128];
        make_METAL_destarg_assign(ctx, code, sizeof (code),
                                 "((%s %s) ? %s : %s)",
                                 src0, cmp, src1, src2);
        output_line(ctx, "%s", code);
    } // for

    set_dstarg_writemask(dst, origmask);
} // emit_METAL_comparison_operations

static void emit_METAL_CND(Context *ctx)
{
    emit_METAL_comparison_operations(ctx, "> 0.5");
} // emit_METAL_CND

static void emit_METAL_DEF(Context *ctx)
{
    const float *val = (const float *) ctx->dwords; // !!! FIXME: could be int?
    char varname[64]; get_METAL_destarg_varname(ctx, varname, sizeof (varname));
    char val0[32]; floatstr(ctx, val0, sizeof (val0), val[0], 1);
    char val1[32]; floatstr(ctx, val1, sizeof (val1), val[1], 1);
    char val2[32]; floatstr(ctx, val2, sizeof (val2), val[2], 1);
    char val3[32]; floatstr(ctx, val3, sizeof (val3), val[3], 1);

    push_output(ctx, &ctx->mainline_top);
    ctx->indent++;
    // The "(void) %s;" is to make the compiler not warn if this isn't used.
    output_line(ctx, "const float4 %s = float4(%s, %s, %s, %s); (void) %s;",
                varname, val0, val1, val2, val3, varname);
    pop_output(ctx);
} // emit_METAL_DEF

EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2RGB) // !!! FIXME
EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3TEX) // !!! FIXME
EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X2DEPTH) // !!! FIXME
EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3) // !!! FIXME

static void emit_METAL_TEXM3X3(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char code[512];

    ctx->metal_need_header_geometric = 1;

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_METAL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_METAL_destarg_varname(ctx, dst, sizeof (dst));

    make_METAL_destarg_assign(ctx, code, sizeof (code),
        "float4(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), 1.0)",
        src0, src1, src2, src3, dst, src4);

    output_line(ctx, "%s", code);
} // emit_METAL_TEXM3X3

EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(TEXDEPTH) // !!! FIXME

static void emit_METAL_CMP(Context *ctx)
{
    emit_METAL_comparison_operations(ctx, ">= 0.0");
} // emit_METAL_CMP

EMIT_METAL_OPCODE_UNIMPLEMENTED_FUNC(BEM) // !!! FIXME

static void emit_METAL_DP2ADD(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_vec2(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_METAL_srcarg_string_scalar(ctx, 2, src2, sizeof (src2));
    char extra[64]; snprintf(extra, sizeof (extra), " + %s", src2);
    emit_METAL_dotprod(ctx, src0, src1, extra);
} // emit_METAL_DP2ADD

static void emit_METAL_DSX(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_graphics = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "dfdx(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_DSX

static void emit_METAL_DSY(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char code[128];
    ctx->metal_need_header_graphics = 1;
    make_METAL_destarg_assign(ctx, code, sizeof (code), "dfdy(%s)", src0);
    output_line(ctx, "%s", code);
} // emit_METAL_DSY

static void emit_METAL_TEXLDD(Context *ctx)
{
    metal_texld(ctx, 1);
} // emit_METAL_TEXLDD

static void emit_METAL_SETP(Context *ctx)
{
    const int vecsize = vecsize_from_writemask(ctx->dest_arg.writemask);
    char src0[64]; make_METAL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_METAL_srcarg_string_masked(ctx, 1, src1, sizeof (src1));
    char code[128];

    // destination is always predicate register (which is type bvec4).
    const char *comp = (vecsize == 1) ?
            get_METAL_comparison_string_scalar(ctx) :
            get_METAL_comparison_string_vector(ctx);

    make_METAL_destarg_assign(ctx, code, sizeof (code),
                              "(%s %s %s)", src0, comp, src1);
    output_line(ctx, "%s", code);
} // emit_METAL_SETP

static void emit_METAL_TEXLDL(Context *ctx)
{
    // !!! FIXME: The spec says we can't use GLSL's texture*Lod() built-ins
    // !!! FIXME:  from fragment shaders for some inexplicable reason.
    // !!! FIXME:  Maybe Metal can do it, but I haven't looked into it yet.
    emit_METAL_TEXLD(ctx);
} // emit_METAL_TEXLDL

static void emit_METAL_BREAKP(Context *ctx)
{
    char src0[64]; make_METAL_srcarg_string_scalar(ctx, 0, src0, sizeof (src0));
    output_line(ctx, "if (%s) { break; }", src0);
} // emit_METAL_BREAKP

static void emit_METAL_RESERVED(Context *ctx)
{
    // do nothing; fails in the state machine.
} // emit_METAL_RESERVED

#endif  // SUPPORT_PROFILE_METAL


#if !SUPPORT_PROFILE_ARB1
#define PROFILE_EMITTER_ARB1(op)
#else
#undef AT_LEAST_ONE_PROFILE
#define AT_LEAST_ONE_PROFILE 1
#define PROFILE_EMITTER_ARB1(op) emit_ARB1_##op,

static inline const char *get_ARB1_register_string(Context *ctx,
                        const RegisterType regtype, const int regnum,
                        char *regnum_str, const size_t regnum_size)
{
    // turns out these are identical at the moment.
    return get_D3D_register_string(ctx,regtype,regnum,regnum_str,regnum_size);
} // get_ARB1_register_string

static const char *allocate_ARB1_scratch_reg_name(Context *ctx, char *buf,
                                                  const size_t buflen)
{
    const int scratch = allocate_scratch_register(ctx);
    snprintf(buf, buflen, "scratch%d", scratch);
    return buf;
} // allocate_ARB1_scratch_reg_name

static inline const char *get_ARB1_branch_label_name(Context *ctx, const int id,
                                                char *buf, const size_t buflen)
{
    snprintf(buf, buflen, "branch_label%d", id);
    return buf;
} // get_ARB1_branch_label_name

static const char *get_ARB1_varname_in_buf(Context *ctx, const RegisterType rt,
                                           const int regnum, char *buf,
                                           const size_t buflen)
{
    // turns out these are identical at the moment.
    return get_D3D_varname_in_buf(ctx, rt, regnum, buf, buflen);
} // get_ARB1_varname_in_buf

static const char *get_ARB1_varname(Context *ctx, const RegisterType rt,
                                    const int regnum)
{
    // turns out these are identical at the moment.
    return get_D3D_varname(ctx, rt, regnum);
} // get_ARB1_varname


static inline const char *get_ARB1_const_array_varname_in_buf(Context *ctx,
                                                const int base, const int size,
                                                char *buf, const size_t buflen)
{
    snprintf(buf, buflen, "c_array_%d_%d", base, size);
    return buf;
} // get_ARB1_const_array_varname_in_buf


static const char *get_ARB1_const_array_varname(Context *ctx, int base, int size)
{
    char buf[64];
    get_ARB1_const_array_varname_in_buf(ctx, base, size, buf, sizeof (buf));
    return StrDup(ctx, buf);
} // get_ARB1_const_array_varname


static const char *make_ARB1_srcarg_string_in_buf(Context *ctx,
                                                  const SourceArgInfo *arg,
                                                  char *buf, size_t buflen)
{
    // !!! FIXME: this can hit pathological cases where we look like this...
    //
    //    dp3 r1.xyz, t0_bx2, t0_bx2
    //    mad r1.xyz, t0_bias, 1-r1, t0_bx2
    //
    // ...which do a lot of duplicate work in arb1...
    //
    //    SUB scratch0, t0, { 0.5, 0.5, 0.5, 0.5 };
    //    MUL scratch0, scratch0, { 2.0, 2.0, 2.0, 2.0 };
    //    SUB scratch1, t0, { 0.5, 0.5, 0.5, 0.5 };
    //    MUL scratch1, scratch1, { 2.0, 2.0, 2.0, 2.0 };
    //    DP3 r1.xyz, scratch0, scratch1;
    //    SUB scratch0, t0, { 0.5, 0.5, 0.5, 0.5 };
    //    SUB scratch1, { 1.0, 1.0, 1.0, 1.0 }, r1;
    //    SUB scratch2, t0, { 0.5, 0.5, 0.5, 0.5 };
    //    MUL scratch2, scratch2, { 2.0, 2.0, 2.0, 2.0 };
    //    MAD r1.xyz, scratch0, scratch1, scratch2;
    //
    // ...notice that the dp3 calculates the same value into two scratch
    //  registers. This case is easier to handle; just see if multiple
    //  source args are identical, build it up once, and use the same
    //  scratch register for multiple arguments in that opcode.
    //  Even better still, only calculate things once across instructions,
    //  and be smart about letting it linger in a scratch register until we
    //  definitely don't need the calculation anymore. That's harder to
    //  write, though.

    char regnum_str[16] = { '\0' };

    // !!! FIXME: use get_ARB1_varname_in_buf() instead?
    const char *regtype_str = NULL;
    if (!arg->relative)
    {
        regtype_str = get_ARB1_register_string(ctx, arg->regtype,
                                               arg->regnum, regnum_str,
                                               sizeof (regnum_str));
    } // if

    const char *rel_lbracket = "";
    char rel_offset[32] = { '\0' };
    const char *rel_rbracket = "";
    char rel_swizzle[4] = { '\0' };
    const char *rel_regtype_str = "";
    if (arg->relative)
    {
        rel_regtype_str = get_ARB1_varname_in_buf(ctx, arg->relative_regtype,
                                                  arg->relative_regnum,
                                                  (char *) alloca(64), 64);

        rel_swizzle[0] = '.';
        rel_swizzle[1] = swizzle_channels[arg->relative_component];
        rel_swizzle[2] = '\0';

        if (!support_nv2(ctx))
        {
            // The address register in ARB1 only allows the '.x' component, so
            //  we need to load the component we need from a temp vector
            //  register into .x as needed.
            assert(arg->relative_regtype == REG_TYPE_ADDRESS);
            assert(arg->relative_regnum == 0);
            if (ctx->last_address_reg_component != arg->relative_component)
            {
                output_line(ctx, "ARL %s.x, addr%d.%c;", rel_regtype_str,
                            arg->relative_regnum,
                            swizzle_channels[arg->relative_component]);
                ctx->last_address_reg_component = arg->relative_component;
            } // if

            rel_swizzle[1] = 'x';
        } // if

        if (arg->regtype == REG_TYPE_INPUT)
            regtype_str = "vertex.attrib";
        else
        {
            assert(arg->regtype == REG_TYPE_CONST);
            const int arrayidx = arg->relative_array->index;
            const int arraysize = arg->relative_array->count;
            const int offset = arg->regnum - arrayidx;
            assert(offset >= 0);
            regtype_str = get_ARB1_const_array_varname_in_buf(ctx, arrayidx,
                                           arraysize, (char *) alloca(64), 64);
            if (offset != 0)
                snprintf(rel_offset, sizeof (rel_offset), " + %d", offset);
        } // else

        rel_lbracket = "[";
        rel_rbracket = "]";
    } // if

    // This is the source register with everything but swizzle and source mods.
    snprintf(buf, buflen, "%s%s%s%s%s%s%s", regtype_str, regnum_str,
             rel_lbracket, rel_regtype_str, rel_swizzle, rel_offset,
             rel_rbracket);

    // Some of the source mods need to generate instructions to a temp
    //  register, in which case we'll replace the register name.
    const SourceMod mod = arg->src_mod;
    const int inplace = ( (mod == SRCMOD_NONE) || (mod == SRCMOD_NEGATE) ||
                          ((mod == SRCMOD_ABS) && support_nv2(ctx)) );

    if (!inplace)
    {
        const size_t len = 64;
        char *stackbuf = (char *) alloca(len);
        regtype_str = allocate_ARB1_scratch_reg_name(ctx, stackbuf, len);
        regnum_str[0] = '\0'; // move value to scratch register.
        rel_lbracket = "";   // scratch register won't use array.
        rel_rbracket = "";
        rel_offset[0] = '\0';
        rel_swizzle[0] = '\0';
        rel_regtype_str = "";
    } // if

    const char *premod_str = "";
    const char *postmod_str = "";
    switch (mod)
    {
        case SRCMOD_NEGATE:
            premod_str = "-";
            break;

        case SRCMOD_BIASNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_BIAS:
            output_line(ctx, "SUB %s, %s, { 0.5, 0.5, 0.5, 0.5 };",
                        regtype_str, buf);
            break;

        case SRCMOD_SIGNNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_SIGN:
            output_line(ctx,
                "MAD %s, %s, { 2.0, 2.0, 2.0, 2.0 }, { -1.0, -1.0, -1.0, -1.0 };",
                regtype_str, buf);
            break;

        case SRCMOD_COMPLEMENT:
            output_line(ctx, "SUB %s, { 1.0, 1.0, 1.0, 1.0 }, %s;",
                        regtype_str, buf);
            break;

        case SRCMOD_X2NEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_X2:
            output_line(ctx, "MUL %s, %s, { 2.0, 2.0, 2.0, 2.0 };",
                        regtype_str, buf);
            break;

        case SRCMOD_DZ:
            fail(ctx, "SRCMOD_DZ currently unsupported in arb1");
            postmod_str = "_dz";
            break;

        case SRCMOD_DW:
            fail(ctx, "SRCMOD_DW currently unsupported in arb1");
            postmod_str = "_dw";
            break;

        case SRCMOD_ABSNEGATE:
            premod_str = "-";
            // fall through.
        case SRCMOD_ABS:
            if (!support_nv2(ctx))  // GL_NV_vertex_program2_option adds this.
                output_line(ctx, "ABS %s, %s;", regtype_str, buf);
            else
            {
                premod_str = (mod == SRCMOD_ABSNEGATE) ? "-|" : "|";
                postmod_str = "|";
            } // else
            break;

        case SRCMOD_NOT:
            fail(ctx, "SRCMOD_NOT currently unsupported in arb1");
            premod_str = "!";
            break;

        case SRCMOD_NONE:
        case SRCMOD_TOTAL:
             break;  // stop compiler whining.
    } // switch

    char swizzle_str[6];
    size_t i = 0;

    if (support_nv4(ctx))  // vFace must be output as "vFace.x" in nv4.
    {
        if (arg->regtype == REG_TYPE_MISCTYPE)
        {
            if ( ((const MiscTypeType) arg->regnum) == MISCTYPE_TYPE_FACE )
            {
                swizzle_str[i++] = '.';
                swizzle_str[i++] = 'x';
            } // if
        } // if
    } // if

    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !no_swizzle(arg->swizzle))
    {
        swizzle_str[i++] = '.';

        // .xxxx is the same as .x, but .xx is illegal...scalar or full!
        if (replicate_swizzle(arg->swizzle))
            swizzle_str[i++] = swizzle_channels[arg->swizzle_x];
        else
        {
            swizzle_str[i++] = swizzle_channels[arg->swizzle_x];
            swizzle_str[i++] = swizzle_channels[arg->swizzle_y];
            swizzle_str[i++] = swizzle_channels[arg->swizzle_z];
            swizzle_str[i++] = swizzle_channels[arg->swizzle_w];
        } // else
    } // if
    swizzle_str[i] = '\0';
    assert(i < sizeof (swizzle_str));

    snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s%s", premod_str,
             regtype_str, regnum_str, rel_lbracket,
             rel_regtype_str, rel_swizzle, rel_offset, rel_rbracket,
             swizzle_str, postmod_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_ARB1_srcarg_string_in_buf

static const char *get_ARB1_destarg_varname(Context *ctx, char *buf,
                                            const size_t buflen)
{
    const DestArgInfo *arg = &ctx->dest_arg;
    return get_ARB1_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, buflen);
} // get_ARB1_destarg_varname

static const char *get_ARB1_srcarg_varname(Context *ctx, const size_t idx,
                                           char *buf, const size_t buflen)
{
    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        *buf = '\0';
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];
    return get_ARB1_varname_in_buf(ctx, arg->regtype, arg->regnum, buf, buflen);
} // get_ARB1_srcarg_varname


static const char *make_ARB1_destarg_string(Context *ctx, char *buf,
                                            const size_t buflen)
{
    const DestArgInfo *arg = &ctx->dest_arg;

    *buf = '\0';

    const char *sat_str = "";
    if (arg->result_mod & MOD_SATURATE)
    {
        // nv4 can use ".SAT" in all program types.
        // For less than nv4, the "_SAT" modifier is only available in
        //  fragment shaders. Every thing else will fake it later in
        //  emit_ARB1_dest_modifiers() ...
        if (support_nv4(ctx))
            sat_str = ".SAT";
        else if (shader_is_pixel(ctx))
            sat_str = "_SAT";
    } // if

    const char *pp_str = "";
    if (arg->result_mod & MOD_PP)
    {
        // Most ARB1 profiles can't do partial precision (MOD_PP), but that's
        //  okay. The spec says lots of Direct3D implementations ignore the
        //  flag anyhow.
        if (support_nv4(ctx))
            pp_str = "H";
    } // if

    // CENTROID only allowed in DCL opcodes, which shouldn't come through here.
    assert((arg->result_mod & MOD_CENTROID) == 0);

    char regnum_str[16];
    const char *regtype_str = get_ARB1_register_string(ctx, arg->regtype,
                                                       arg->regnum, regnum_str,
                                                       sizeof (regnum_str));
    if (regtype_str == NULL)
    {
        fail(ctx, "Unknown destination register type.");
        return buf;
    } // if

    char writemask_str[6];
    size_t i = 0;
    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
    if (!scalar && !writemask_xyzw(arg->writemask))
    {
        writemask_str[i++] = '.';
        if (arg->writemask0) writemask_str[i++] = 'x';
        if (arg->writemask1) writemask_str[i++] = 'y';
        if (arg->writemask2) writemask_str[i++] = 'z';
        if (arg->writemask3) writemask_str[i++] = 'w';
    } // if
    writemask_str[i] = '\0';
    assert(i < sizeof (writemask_str));

    //const char *pred_left = "";
    //const char *pred_right = "";
    char pred[32] = { '\0' };
    if (ctx->predicated)
    {
        fail(ctx, "dest register predication currently unsupported in arb1");
        return buf;
        //pred_left = "(";
        //pred_right = ") ";
        make_ARB1_srcarg_string_in_buf(ctx, &ctx->predicate_arg,
                                       pred, sizeof (pred));
    } // if

    snprintf(buf, buflen, "%s%s %s%s%s", pp_str, sat_str,
             regtype_str, regnum_str, writemask_str);
    // !!! FIXME: make sure the scratch buffer was large enough.
    return buf;
} // make_ARB1_destarg_string


static void emit_ARB1_dest_modifiers(Context *ctx)
{
    const DestArgInfo *arg = &ctx->dest_arg;

    if (arg->result_shift != 0x0)
    {
        char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        const char *multiplier = NULL;

        switch (arg->result_shift)
        {
            case 0x1: multiplier = "2.0"; break;
            case 0x2: multiplier = "4.0"; break;
            case 0x3: multiplier = "8.0"; break;
            case 0xD: multiplier = "0.125"; break;
            case 0xE: multiplier = "0.25"; break;
            case 0xF: multiplier = "0.5"; break;
        } // switch

        if (multiplier != NULL)
        {
            char var[64]; get_ARB1_destarg_varname(ctx, var, sizeof (var));
            output_line(ctx, "MUL%s, %s, %s;", dst, var, multiplier);
        } // if
    } // if

    if (arg->result_mod & MOD_SATURATE)
    {
        // nv4 and/or pixel shaders just used the "SAT" modifier, instead.
        if ( (!support_nv4(ctx)) && (!shader_is_pixel(ctx)) )
        {
            char var[64]; get_ARB1_destarg_varname(ctx, var, sizeof (var));
            char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
            output_line(ctx, "MIN%s, %s, 1.0;", dst, var);
            output_line(ctx, "MAX%s, %s, 0.0;", dst, var);
        } // if
    } // if
} // emit_ARB1_dest_modifiers


static const char *make_ARB1_srcarg_string(Context *ctx, const size_t idx,
                                           char *buf, const size_t buflen)
{
    if (idx >= STATICARRAYLEN(ctx->source_args))
    {
        fail(ctx, "Too many source args");
        *buf = '\0';
        return buf;
    } // if

    const SourceArgInfo *arg = &ctx->source_args[idx];
    return make_ARB1_srcarg_string_in_buf(ctx, arg, buf, buflen);
} // make_ARB1_srcarg_string

static void emit_ARB1_opcode_ds(Context *ctx, const char *opcode)
{
    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    output_line(ctx, "%s%s, %s;", opcode, dst, src0);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_opcode_ds

static void emit_ARB1_opcode_dss(Context *ctx, const char *opcode)
{
    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
    output_line(ctx, "%s%s, %s, %s;", opcode, dst, src0, src1);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_opcode_dss

static void emit_ARB1_opcode_dsss(Context *ctx, const char *opcode)
{
    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
    output_line(ctx, "%s%s, %s, %s, %s;", opcode, dst, src0, src1, src2);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_opcode_dsss


#define EMIT_ARB1_OPCODE_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_D_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_d(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_S_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_s(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_SS_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_ss(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_DS_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_ds(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_DSS_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_dss(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_DSSS_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_dsss(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_DSSSS_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        emit_ARB1_opcode_dssss(ctx, #op); \
    }
#define EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(op) \
    static void emit_ARB1_##op(Context *ctx) { \
        failf(ctx, #op " unimplemented in %s profile", ctx->profile->name); \
    }


static void emit_ARB1_start(Context *ctx, const char *profilestr)
{
    const char *shader_str = NULL;
    const char *shader_full_str = NULL;
    if (shader_is_vertex(ctx))
    {
        shader_str = "vp";
        shader_full_str = "vertex";
    } // if
    else if (shader_is_pixel(ctx))
    {
        shader_str = "fp";
        shader_full_str = "fragment";
    } // else if
    else
    {
        failf(ctx, "Shader type %u unsupported in this profile.",
              (uint) ctx->shader_type);
        return;
    } // if

    set_output(ctx, &ctx->preflight);

    if (strcmp(profilestr, MOJOSHADER_PROFILE_ARB1) == 0)
        output_line(ctx, "!!ARB%s1.0", shader_str);

    #if SUPPORT_PROFILE_ARB1_NV
    else if (strcmp(profilestr, MOJOSHADER_PROFILE_NV2) == 0)
    {
        ctx->profile_supports_nv2 = 1;
        output_line(ctx, "!!ARB%s1.0", shader_str);
        output_line(ctx, "OPTION NV_%s_program2;", shader_full_str);
    } // else if

    else if (strcmp(profilestr, MOJOSHADER_PROFILE_NV3) == 0)
    {
        // there's no NV_fragment_program3, so just use 2.
        const int ver = shader_is_pixel(ctx) ? 2 : 3;
        ctx->profile_supports_nv2 = 1;
        ctx->profile_supports_nv3 = 1;
        output_line(ctx, "!!ARB%s1.0", shader_str);
        output_line(ctx, "OPTION NV_%s_program%d;", shader_full_str, ver);
    } // else if

    else if (strcmp(profilestr, MOJOSHADER_PROFILE_NV4) == 0)
    {
        ctx->profile_supports_nv2 = 1;
        ctx->profile_supports_nv3 = 1;
        ctx->profile_supports_nv4 = 1;
        output_line(ctx, "!!NV%s4.0", shader_str);
    } // else if
    #endif

    else
    {
        failf(ctx, "Profile '%s' unsupported or unknown.", profilestr);
    } // else

    set_output(ctx, &ctx->mainline);
} // emit_ARB1_start

static void emit_ARB1_end(Context *ctx)
{
    // ps_1_* writes color to r0 instead oC0. We move it to the right place.
    // We don't have to worry about a RET opcode messing this up, since
    //  RET isn't available before ps_2_0.
    if (shader_is_pixel(ctx) && !shader_version_atleast(ctx, 2, 0))
    {
        set_used_register(ctx, REG_TYPE_COLOROUT, 0, 1);
        output_line(ctx, "MOV oC0, r0;");
    } // if

    output_line(ctx, "END");
} // emit_ARB1_end

static void emit_ARB1_phase(Context *ctx)
{
    // no-op in arb1.
} // emit_ARB1_phase

static inline const char *arb1_float_temp(const Context *ctx)
{
    // nv4 lets you specify data type.
    return (support_nv4(ctx)) ? "FLOAT TEMP" : "TEMP";
} // arb1_float_temp

static void emit_ARB1_finalize(Context *ctx)
{
    push_output(ctx, &ctx->preflight);

    if (shader_is_vertex(ctx) && !ctx->arb1_wrote_position)
        output_line(ctx, "OPTION ARB_position_invariant;");

    if (shader_is_pixel(ctx) && ctx->have_multi_color_outputs)
        output_line(ctx, "OPTION ARB_draw_buffers;");

    pop_output(ctx);

    const char *tmpstr = arb1_float_temp(ctx);
    int i;
    push_output(ctx, &ctx->globals);
    for (i = 0; i < ctx->max_scratch_registers; i++)
    {
        char buf[64];
        allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        output_line(ctx, "%s %s;", tmpstr, buf);
    } // for

    // nv2 fragment programs (and anything nv4) have a real REP/ENDREP.
    if ( (support_nv2(ctx)) && (!shader_is_pixel(ctx)) && (!support_nv4(ctx)) )
    {
        // set up temps for nv2 REP/ENDREP emulation through branching.
        for (i = 0; i < ctx->max_reps; i++)
            output_line(ctx, "TEMP rep%d;", i);
    } // if

    pop_output(ctx);
    assert(ctx->scratch_registers == ctx->max_scratch_registers);
} // emit_ARB1_finalize

static void emit_ARB1_global(Context *ctx, RegisterType regtype, int regnum)
{
    // !!! FIXME: dependency on ARB1 profile.  // !!! FIXME about FIXME: huh?
    char varname[64];
    get_ARB1_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    push_output(ctx, &ctx->globals);
    switch (regtype)
    {
        case REG_TYPE_ADDRESS:
            if (shader_is_pixel(ctx))  // actually REG_TYPE_TEXTURE.
            {
                // We have to map texture registers to temps for ps_1_1, since
                //  they work like temps, initialize with tex coords, and the
                //  ps_1_1 TEX opcode expects to overwrite it.
                if (!shader_version_atleast(ctx, 1, 4))
                {
                    output_line(ctx, "%s %s;", arb1_float_temp(ctx), varname);
                    push_output(ctx, &ctx->mainline_top);
                    output_line(ctx, "MOV %s, fragment.texcoord[%d];",
                                varname, regnum);
                    pop_output(ctx);
                } // if
                break;
            } // if

            // nv4 replaced address registers with generic int registers.
            if (support_nv4(ctx))
                output_line(ctx, "INT TEMP %s;", varname);
            else
            {
                // nv2 has four-component address already, but stock arb1 has
                //  to emulate it in a temporary, and move components to the
                //  scalar ADDRESS register on demand.
                output_line(ctx, "ADDRESS %s;", varname);
                if (!support_nv2(ctx))
                    output_line(ctx, "TEMP addr%d;", regnum);
            } // else
            break;

        //case REG_TYPE_PREDICATE:
        //    output_line(ctx, "bvec4 %s;", varname);
        //    break;
        case REG_TYPE_TEMP:
            output_line(ctx, "%s %s;", arb1_float_temp(ctx), varname);
            break;
        //case REG_TYPE_LOOP:
        //    break; // no-op. We declare these in for loops at the moment.
        //case REG_TYPE_LABEL:
        //    break; // no-op. If we see it here, it means we optimized it out.
        default:
            fail(ctx, "BUG: we used a register we don't know how to define.");
            break;
    } // switch
    pop_output(ctx);
} // emit_ARB1_global

static void emit_ARB1_array(Context *ctx, VariableList *var)
{
    // All uniforms are now packed tightly into the program.local array,
    //  instead of trying to map them to the d3d registers. So this needs to
    //  map to the next piece of the array we haven't used yet. Thankfully,
    //  arb1 lets you make a PARAM array that maps to a subset of another
    //  array; we don't need to do offsets, since myarray[0] can map to
    //  program.local[5] without any extra math from us.
    const int base = var->index;
    const int size = var->count;
    const int arb1base = ctx->uniform_float4_count +
                         ctx->uniform_int4_count +
                         ctx->uniform_bool_count;
    char varname[64];
    get_ARB1_const_array_varname_in_buf(ctx, base, size, varname, sizeof (varname));
    push_output(ctx, &ctx->globals);
    output_line(ctx, "PARAM %s[%d] = { program.local[%d..%d] };", varname,
                size, arb1base, (arb1base + size) - 1);
    pop_output(ctx);
    var->emit_position = arb1base;
} // emit_ARB1_array

static void emit_ARB1_const_array(Context *ctx, const ConstantsList *clist,
                                  int base, int size)
{
    char varname[64];
    get_ARB1_const_array_varname_in_buf(ctx, base, size, varname, sizeof (varname));
    int i;

    push_output(ctx, &ctx->globals);
    output_line(ctx, "PARAM %s[%d] = {", varname, size);
    ctx->indent++;

    for (i = 0; i < size; i++)
    {
        while (clist->constant.type != MOJOSHADER_UNIFORM_FLOAT)
            clist = clist->next;
        assert(clist->constant.index == (base + i));

        char val0[32];
        char val1[32];
        char val2[32];
        char val3[32];
        floatstr(ctx, val0, sizeof (val0), clist->constant.value.f[0], 1);
        floatstr(ctx, val1, sizeof (val1), clist->constant.value.f[1], 1);
        floatstr(ctx, val2, sizeof (val2), clist->constant.value.f[2], 1);
        floatstr(ctx, val3, sizeof (val3), clist->constant.value.f[3], 1);

        output_line(ctx, "{ %s, %s, %s, %s }%s", val0, val1, val2, val3,
                    (i < (size-1)) ? "," : "");

        clist = clist->next;
    } // for

    ctx->indent--;
    output_line(ctx, "};");
    pop_output(ctx);
} // emit_ARB1_const_array

static void emit_ARB1_uniform(Context *ctx, RegisterType regtype, int regnum,
                              const VariableList *var)
{
    // We pack these down into the program.local array, so if we only use
    //  register c439, it'll actually map to program.local[0]. This will
    //  prevent overflows when we actually have enough resources to run.

    const char *arrayname = "program.local";
    int index = 0;

    char varname[64];
    get_ARB1_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    push_output(ctx, &ctx->globals);

    if (var == NULL)
    {
        // all types share one array (rather, all types convert to float4).
        index = ctx->uniform_float4_count + ctx->uniform_int4_count +
                ctx->uniform_bool_count;
    } // if

    else
    {
        const int arraybase = var->index;
        if (var->constant)
        {
            const int arraysize = var->count;
            arrayname = get_ARB1_const_array_varname_in_buf(ctx, arraybase,
                                        arraysize, (char *) alloca(64), 64);
            index = (regnum - arraybase);
        } // if
        else
        {
            assert(var->emit_position != -1);
            index = (regnum - arraybase) + var->emit_position;
        } // else
    } // else

    output_line(ctx, "PARAM %s = %s[%d];", varname, arrayname, index);
    pop_output(ctx);
} // emit_ARB1_uniform

static void emit_ARB1_sampler(Context *ctx,int stage,TextureType ttype,int tb)
{
    // this is mostly a no-op...you don't predeclare samplers in arb1.

    if (tb)  // This sampler used a ps_1_1 TEXBEM opcode?
    {
        const int index = ctx->uniform_float4_count + ctx->uniform_int4_count +
                          ctx->uniform_bool_count;
        char var[64];
        get_ARB1_varname_in_buf(ctx, REG_TYPE_SAMPLER, stage, var, sizeof(var));
        push_output(ctx, &ctx->globals);
        output_line(ctx, "PARAM %s_texbem = program.local[%d];", var, index);
        output_line(ctx, "PARAM %s_texbeml = program.local[%d];", var, index+1);
        pop_output(ctx);
        ctx->uniform_float4_count += 2;
    } // if
} // emit_ARB1_sampler

// !!! FIXME: a lot of cut-and-paste here from emit_GLSL_attribute().
static void emit_ARB1_attribute(Context *ctx, RegisterType regtype, int regnum,
                                MOJOSHADER_usage usage, int index, int wmask,
                                int flags)
{
    // !!! FIXME: this function doesn't deal with write masks at all yet!
    const char *usage_str = NULL;
    const char *arrayleft = "";
    const char *arrayright = "";
    char index_str[16] = { '\0' };

    char varname[64];
    get_ARB1_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));

    //assert((flags & MOD_PP) == 0);  // !!! FIXME: is PP allowed?

    if (index != 0)  // !!! FIXME: a lot of these MUST be zero.
        snprintf(index_str, sizeof (index_str), "%u", (uint) index);

    if (shader_is_vertex(ctx))
    {
        // pre-vs3 output registers.
        // these don't ever happen in DCL opcodes, I think. Map to vs_3_*
        //  output registers.
        if (!shader_version_atleast(ctx, 3, 0))
        {
            if (regtype == REG_TYPE_RASTOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                index = regnum;
                switch ((const RastOutType) regnum)
                {
                    case RASTOUT_TYPE_POSITION:
                        usage = MOJOSHADER_USAGE_POSITION;
                        break;
                    case RASTOUT_TYPE_FOG:
                        usage = MOJOSHADER_USAGE_FOG;
                        break;
                    case RASTOUT_TYPE_POINT_SIZE:
                        usage = MOJOSHADER_USAGE_POINTSIZE;
                        break;
                } // switch
            } // if

            else if (regtype == REG_TYPE_ATTROUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_COLOR;
                index = regnum;
            } // else if

            else if (regtype == REG_TYPE_TEXCRDOUT)
            {
                regtype = REG_TYPE_OUTPUT;
                usage = MOJOSHADER_USAGE_TEXCOORD;
                index = regnum;
            } // else if
        } // if

        // to avoid limitations of various GL entry points for input
        // attributes (glSecondaryColorPointer() can only take 3 component
        // items, glVertexPointer() can't do GL_UNSIGNED_BYTE, many other
        // issues), we set up all inputs as generic vertex attributes, so we
        // can pass data in just about any form, and ignore the built-in GLSL
        // attributes like gl_SecondaryColor. Output needs to use the the
        // built-ins, though, but we don't have to worry about the GL entry
        // point limitations there.

        if (regtype == REG_TYPE_INPUT)
        {
            const int attr = ctx->assigned_vertex_attributes++;
            push_output(ctx, &ctx->globals);
            output_line(ctx, "ATTRIB %s = vertex.attrib[%d];", varname, attr);
            pop_output(ctx);
        } // if

        else if (regtype == REG_TYPE_OUTPUT)
        {
            switch (usage)
            {
                case MOJOSHADER_USAGE_POSITION:
                    ctx->arb1_wrote_position = 1;
                    usage_str = "result.position";
                    break;
                case MOJOSHADER_USAGE_POINTSIZE:
                    usage_str = "result.pointsize";
                    break;
                case MOJOSHADER_USAGE_COLOR:
                    index_str[0] = '\0';  // no explicit number.
                    if (index == 0)
                        usage_str = "result.color.primary";
                    else if (index == 1)
                        usage_str = "result.color.secondary";
                    break;
                case MOJOSHADER_USAGE_FOG:
                    usage_str = "result.fogcoord";
                    break;
                case MOJOSHADER_USAGE_TEXCOORD:
                    snprintf(index_str, sizeof (index_str), "%u", (uint) index);
                    usage_str = "result.texcoord";
                    arrayleft = "[";
                    arrayright = "]";
                    break;
                default:
                    // !!! FIXME: we need to deal with some more built-in varyings here.
                    break;
            } // switch

            // !!! FIXME: the #define is a little hacky, but it means we don't
            // !!! FIXME:  have to track these separately if this works.
            push_output(ctx, &ctx->globals);
            // no mapping to built-in var? Just make it a regular global, pray.
            if (usage_str == NULL)
                output_line(ctx, "%s %s;", arb1_float_temp(ctx), varname);
            else
            {
                output_line(ctx, "OUTPUT %s = %s%s%s%s;", varname, usage_str,
                            arrayleft, index_str, arrayright);
            } // else
            pop_output(ctx);
        } // else if

        else
        {
            fail(ctx, "unknown vertex shader attribute register");
        } // else
    } // if

    else if (shader_is_pixel(ctx))
    {
        const char *paramtype_str = "ATTRIB";

        // samplers DCLs get handled in emit_ARB1_sampler().

        if (flags & MOD_CENTROID)
        {
            if (!support_nv4(ctx))  // GL_NV_fragment_program4 adds centroid.
            {
                // !!! FIXME: should we just wing it without centroid here?
                failf(ctx, "centroid unsupported in %s profile",
                      ctx->profile->name);
                return;
            } // if

            paramtype_str = "CENTROID ATTRIB";
        } // if

        if (regtype == REG_TYPE_COLOROUT)
        {
            paramtype_str = "OUTPUT";
            usage_str = "result.color";
            if (ctx->have_multi_color_outputs)
            {
                // We have to gamble that you have GL_ARB_draw_buffers.
                // You probably do at this point if you have a sane setup.
                snprintf(index_str, sizeof (index_str), "%u", (uint) regnum);
                arrayleft = "[";
                arrayright = "]";
            } // if
        } // if

        else if (regtype == REG_TYPE_DEPTHOUT)
        {
            paramtype_str = "OUTPUT";
            usage_str = "result.depth";
        } // else if

        // !!! FIXME: can you actualy have a texture register with COLOR usage?
        else if ((regtype == REG_TYPE_TEXTURE) || (regtype == REG_TYPE_INPUT))
        {
            if (usage == MOJOSHADER_USAGE_TEXCOORD)
            {
                // ps_1_1 does a different hack for this attribute.
                //  Refer to emit_ARB1_global()'s REG_TYPE_TEXTURE code.
                if (shader_version_atleast(ctx, 1, 4))
                {
                    snprintf(index_str, sizeof (index_str), "%u", (uint) index);
                    usage_str = "fragment.texcoord";
                    arrayleft = "[";
                    arrayright = "]";
                } // if
            } // if

            else if (usage == MOJOSHADER_USAGE_COLOR)
            {
                index_str[0] = '\0';  // no explicit number.
                if (index == 0)
                    usage_str = "fragment.color.primary";
                else if (index == 1)
                    usage_str = "fragment.color.secondary";
                else
                    fail(ctx, "unsupported color index");
            } // else if
        } // else if

        else if (regtype == REG_TYPE_MISCTYPE)
        {
            const MiscTypeType mt = (MiscTypeType) regnum;
            if (mt == MISCTYPE_TYPE_FACE)
            {
                if (support_nv4(ctx))  // FINALLY, a vFace equivalent in nv4!
                {
                    index_str[0] = '\0';  // no explicit number.
                    usage_str = "fragment.facing";
                } // if
                else
                {
                    failf(ctx, "vFace unsupported in %s profile",
                          ctx->profile->name);
                } // else
            } // if
            else if (mt == MISCTYPE_TYPE_POSITION)
            {
                index_str[0] = '\0';  // no explicit number.
                usage_str = "fragment.position";  // !!! FIXME: is this the same coord space as D3D?
            } // else if
            else
            {
                fail(ctx, "BUG: unhandled misc register");
            } // else
        } // else if

        else
        {
            fail(ctx, "unknown pixel shader attribute register");
        } // else

        if (usage_str != NULL)
        {
            push_output(ctx, &ctx->globals);
            output_line(ctx, "%s %s = %s%s%s%s;", paramtype_str, varname,
                        usage_str, arrayleft, index_str, arrayright);
            pop_output(ctx);
        } // if
    } // else if

    else
    {
        fail(ctx, "Unknown shader type");  // state machine should catch this.
    } // else
} // emit_ARB1_attribute

static void emit_ARB1_RESERVED(Context *ctx) { /* no-op. */ }

static void emit_ARB1_NOP(Context *ctx)
{
    // There is no NOP in arb1. Just don't output anything here.
} // emit_ARB1_NOP

EMIT_ARB1_OPCODE_DS_FUNC(MOV)
EMIT_ARB1_OPCODE_DSS_FUNC(ADD)
EMIT_ARB1_OPCODE_DSS_FUNC(SUB)
EMIT_ARB1_OPCODE_DSSS_FUNC(MAD)
EMIT_ARB1_OPCODE_DSS_FUNC(MUL)
EMIT_ARB1_OPCODE_DS_FUNC(RCP)

static void emit_ARB1_RSQ(Context *ctx)
{
    // nv4 doesn't force abs() on this, so negative values will generate NaN.
    // The spec says you should force the abs() yourself.
    if (!support_nv4(ctx))
    {
        emit_ARB1_opcode_ds(ctx, "RSQ");  // pre-nv4 implies ABS.
        return;
    } // if

    // we can optimize this to use nv2's |abs| construct in some cases.
    if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
         (ctx->source_args[0].src_mod == SRCMOD_NEGATE) ||
         (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
        ctx->source_args[0].src_mod = SRCMOD_ABS;

    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));

    if (ctx->source_args[0].src_mod == SRCMOD_ABS)
        output_line(ctx, "RSQ%s, %s;", dst, src0);
    else
    {
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        output_line(ctx, "ABS %s, %s;", buf, src0);
        output_line(ctx, "RSQ%s, %s.x;", dst, buf);
    } // else

    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_RSQ

EMIT_ARB1_OPCODE_DSS_FUNC(DP3)
EMIT_ARB1_OPCODE_DSS_FUNC(DP4)
EMIT_ARB1_OPCODE_DSS_FUNC(MIN)
EMIT_ARB1_OPCODE_DSS_FUNC(MAX)
EMIT_ARB1_OPCODE_DSS_FUNC(SLT)
EMIT_ARB1_OPCODE_DSS_FUNC(SGE)

static void emit_ARB1_EXP(Context *ctx) { emit_ARB1_opcode_ds(ctx, "EX2"); }

static void arb1_log(Context *ctx, const char *opcode)
{
    // !!! FIXME: SRCMOD_NEGATE can be made into SRCMOD_ABS here, too
    // we can optimize this to use nv2's |abs| construct in some cases.
    if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
         (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
        ctx->source_args[0].src_mod = SRCMOD_ABS;

    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));

    if (ctx->source_args[0].src_mod == SRCMOD_ABS)
        output_line(ctx, "%s%s, %s;", opcode, dst, src0);
    else
    {
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        output_line(ctx, "ABS %s, %s;", buf, src0);
        output_line(ctx, "%s%s, %s.x;", opcode, dst, buf);
    } // else

    emit_ARB1_dest_modifiers(ctx);
} // arb1_log


static void emit_ARB1_LOG(Context *ctx)
{
    arb1_log(ctx, "LG2");
} // emit_ARB1_LOG


EMIT_ARB1_OPCODE_DS_FUNC(LIT)
EMIT_ARB1_OPCODE_DSS_FUNC(DST)

static void emit_ARB1_LRP(Context *ctx)
{
    if (shader_is_pixel(ctx))  // fragment shaders have a matching LRP opcode.
        emit_ARB1_opcode_dsss(ctx, "LRP");
    else
    {
        char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
        char src2[64]; make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));

        // LRP is: dest = src2 + src0 * (src1 - src2)
        output_line(ctx, "SUB %s, %s, %s;", buf, src1, src2);
        output_line(ctx, "MAD%s, %s, %s, %s;", dst, buf, src0, src2);
        emit_ARB1_dest_modifiers(ctx);
    } // else
} // emit_ARB1_LRP

EMIT_ARB1_OPCODE_DS_FUNC(FRC)

static void arb1_MxXy(Context *ctx, const int x, const int y)
{
    DestArgInfo *dstarg = &ctx->dest_arg;
    const int origmask = dstarg->writemask;
    char src0[64];
    int i;

    make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));

    for (i = 0; i < y; i++)
    {
        char dst[64];
        char row[64];
        make_ARB1_srcarg_string(ctx, i + 1, row, sizeof (row));
        set_dstarg_writemask(dstarg, 1 << i);
        make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        output_line(ctx, "DP%d%s, %s, %s;", x, dst, src0, row);
    } // for

    set_dstarg_writemask(dstarg, origmask);
    emit_ARB1_dest_modifiers(ctx);
} // arb1_MxXy

static void emit_ARB1_M4X4(Context *ctx) { arb1_MxXy(ctx, 4, 4); }
static void emit_ARB1_M4X3(Context *ctx) { arb1_MxXy(ctx, 4, 3); }
static void emit_ARB1_M3X4(Context *ctx) { arb1_MxXy(ctx, 3, 4); }
static void emit_ARB1_M3X3(Context *ctx) { arb1_MxXy(ctx, 3, 3); }
static void emit_ARB1_M3X2(Context *ctx) { arb1_MxXy(ctx, 3, 2); }

static void emit_ARB1_CALL(Context *ctx)
{
    if (!support_nv2(ctx))  // no branching in stock ARB1.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
        return;
    } // if

    char labelstr[64];
    get_ARB1_srcarg_varname(ctx, 0, labelstr, sizeof (labelstr));
    output_line(ctx, "CAL %s;", labelstr);
} // emit_ARB1_CALL

static void emit_ARB1_CALLNZ(Context *ctx)
{
    // !!! FIXME: if src1 is a constbool that's true, we can remove the
    // !!! FIXME:  if. If it's false, we can make this a no-op.

    if (!support_nv2(ctx))  // no branching in stock ARB1.
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    else
    {
        // !!! FIXME: double-check this.
        char labelstr[64];
        char scratch[64];
        char src1[64];
        get_ARB1_srcarg_varname(ctx, 0, labelstr, sizeof (labelstr));
        get_ARB1_srcarg_varname(ctx, 1, src1, sizeof (src1));
        allocate_ARB1_scratch_reg_name(ctx, scratch, sizeof (scratch));
        output_line(ctx, "MOVC %s, %s;", scratch, src1);
        output_line(ctx, "CAL %s (NE.x);", labelstr);
    } // else
} // emit_ARB1_CALLNZ

// !!! FIXME: needs BRA in nv2, LOOP in nv2 fragment progs, and REP in nv4.
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(LOOP)

static void emit_ARB1_RET(Context *ctx)
{
    // don't fail() if no nv2...maybe we're just ending the mainline?
    //  if we're ending a LABEL that had no CALL, this would all be written
    //  to ctx->ignore anyhow, so this should be "safe" ... arb1 profile will
    //  just end up throwing all this code out.
    if (support_nv2(ctx))  // no branching in stock ARB1.
        output_line(ctx, "RET;");
    set_output(ctx, &ctx->mainline); // in case we were ignoring this function.
} // emit_ARB1_RET


EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(ENDLOOP)

static void emit_ARB1_LABEL(Context *ctx)
{
    if (!support_nv2(ctx))  // no branching in stock ARB1.
        return;  // don't fail()...maybe we never use it, but do fail in CALL.

    const int label = ctx->source_args[0].regnum;
    RegisterList *reg = reglist_find(&ctx->used_registers, REG_TYPE_LABEL, label);

    // MSDN specs say CALL* has to come before the LABEL, so we know if we
    //  can ditch the entire function here as unused.
    if (reg == NULL)
        set_output(ctx, &ctx->ignore);  // Func not used. Parse, but don't output.

    // !!! FIXME: it would be nice if we could determine if a function is
    // !!! FIXME:  only called once and, if so, forcibly inline it.

    //const char *uses_loopreg = ((reg) && (reg->misc == 1)) ? "int aL" : "";
    char labelstr[64];
    get_ARB1_srcarg_varname(ctx, 0, labelstr, sizeof (labelstr));
    output_line(ctx, "%s:", labelstr);
} // emit_ARB1_LABEL


static void emit_ARB1_POW(Context *ctx)
{
    // we can optimize this to use nv2's |abs| construct in some cases.
    if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
         (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
        ctx->source_args[0].src_mod = SRCMOD_ABS;

    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));

    if (ctx->source_args[0].src_mod == SRCMOD_ABS)
        output_line(ctx, "POW%s, %s, %s;", dst, src0, src1);
    else
    {
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        output_line(ctx, "ABS %s, %s;", buf, src0);
        output_line(ctx, "POW%s, %s.x, %s;", dst, buf, src1);
    } // else

    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_POW

static void emit_ARB1_CRS(Context *ctx) { emit_ARB1_opcode_dss(ctx, "XPD"); }

static void emit_ARB1_SGN(Context *ctx)
{
    if (support_nv2(ctx))
        emit_ARB1_opcode_ds(ctx, "SSG");
    else
    {
        char dst[64];
        char src0[64];
        char scratch1[64];
        char scratch2[64];
        make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        allocate_ARB1_scratch_reg_name(ctx, scratch1, sizeof (scratch1));
        allocate_ARB1_scratch_reg_name(ctx, scratch2, sizeof (scratch2));
        output_line(ctx, "SLT %s, %s, 0.0;", scratch1, src0);
        output_line(ctx, "SLT %s, -%s, 0.0;", scratch2, src0);
        output_line(ctx, "ADD%s -%s, %s;", dst, scratch1, scratch2);
        emit_ARB1_dest_modifiers(ctx);
    } // else
} // emit_ARB1_SGN

EMIT_ARB1_OPCODE_DS_FUNC(ABS)

static void emit_ARB1_NRM(Context *ctx)
{
    // nv2 fragment programs (and anything nv4) have a real NRM.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        emit_ARB1_opcode_ds(ctx, "NRM");
    else
    {
        char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        output_line(ctx, "DP3 %s.w, %s, %s;", buf, src0, src0);
        output_line(ctx, "RSQ %s.w, %s.w;", buf, buf);
        output_line(ctx, "MUL%s, %s.w, %s;", dst, buf, src0);
        emit_ARB1_dest_modifiers(ctx);
    } // else
} // emit_ARB1_NRM


static void emit_ARB1_SINCOS(Context *ctx)
{
    // we don't care about the temp registers that <= sm2 demands; ignore them.
    const int mask = ctx->dest_arg.writemask;

    // arb1 fragment programs and everything nv4 have sin/cos/sincos opcodes.
    if ((shader_is_pixel(ctx)) || (support_nv4(ctx)))
    {
        char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        if (writemask_x(mask))
            output_line(ctx, "COS%s, %s;", dst, src0);
        else if (writemask_y(mask))
            output_line(ctx, "SIN%s, %s;", dst, src0);
        else if (writemask_xy(mask))
            output_line(ctx, "SCS%s, %s;", dst, src0);
    } // if

    // nv2+ profiles have sin and cos opcodes.
    else if (support_nv2(ctx))
    {
        char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
        char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        if (writemask_x(mask))
            output_line(ctx, "COS %s.x, %s;", dst, src0);
        else if (writemask_y(mask))
            output_line(ctx, "SIN %s.y, %s;", dst, src0);
        else if (writemask_xy(mask))
        {
            output_line(ctx, "SIN %s.x, %s;", dst, src0);
            output_line(ctx, "COS %s.y, %s;", dst, src0);
        } // else if
    } // if

    else  // big nasty.
    {
        char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
        char src0[64]; get_ARB1_srcarg_varname(ctx, 0, src0, sizeof (src0));
        const int need_sin = (writemask_x(mask) || writemask_xy(mask));
        const int need_cos = (writemask_y(mask) || writemask_xy(mask));
        char scratch[64];

        if (need_sin || need_cos)
            allocate_ARB1_scratch_reg_name(ctx, scratch, sizeof (scratch));

        // These sin() and cos() approximations originally found here:
        //    http://www.devmaster.net/forums/showthread.php?t=5784
        //
        // const float B = 4.0f / M_PI;
        // const float C = -4.0f / (M_PI * M_PI);
        // float y = B * x + C * x * fabs(x);
        //
        // // optional better precision...
        // const float P = 0.225f;
        // y = P * (y * fabs(y) - y) + y;
        //
        //
        // That first thing can be reduced to:
        // const float y = ((1.2732395447351626861510701069801f * x) +
        //             ((-0.40528473456935108577551785283891f * x) * fabs(x)));

        if (need_sin)
        {
            // !!! FIXME: use SRCMOD_ABS here?
            output_line(ctx, "ABS %s.x, %s.x;", dst, src0);
            output_line(ctx, "MUL %s.x, %s.x, -0.40528473456935108577551785283891;", dst, dst);
            output_line(ctx, "MUL %s.x, %s.x, 1.2732395447351626861510701069801;", scratch, src0);
            output_line(ctx, "MAD %s.x, %s.x, %s.x, %s.x;", dst, dst, src0, scratch);
        } // if

        // cosine is sin(x + M_PI/2), but you have to wrap x to pi:
        //  if (x+(M_PI/2) > M_PI)
        //      x -= 2 * M_PI;
        //
        // which is...
        //  if (x+(1.57079637050628662109375) > 3.1415927410125732421875)
        //      x += -6.283185482025146484375;

        if (need_cos)
        {
            output_line(ctx, "ADD %s.x, %s.x, 1.57079637050628662109375;", scratch, src0);
            output_line(ctx, "SGE %s.y, %s.x, 3.1415927410125732421875;", scratch, scratch);
            output_line(ctx, "MAD %s.x, %s.y, -6.283185482025146484375, %s.x;", scratch, scratch, scratch);
            output_line(ctx, "ABS %s.x, %s.x;", dst, src0);
            output_line(ctx, "MUL %s.x, %s.x, -0.40528473456935108577551785283891;", dst, dst);
            output_line(ctx, "MUL %s.x, %s.x, 1.2732395447351626861510701069801;", scratch, src0);
            output_line(ctx, "MAD %s.y, %s.x, %s.x, %s.x;", dst, dst, src0, scratch);
        } // if
    } // else

    // !!! FIXME: might not have done anything. Don't emit if we didn't.
    if (!isfail(ctx))
        emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_SINCOS


static void emit_ARB1_REP(Context *ctx)
{
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));

    // nv2 fragment programs (and everything nv4) have a real REP.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        output_line(ctx, "REP %s;", src0);

    else if (support_nv2(ctx))
    {
        // no REP, but we can use branches.
        char failbranch[32];
        char topbranch[32];
        const int toplabel = allocate_branch_label(ctx);
        const int faillabel = allocate_branch_label(ctx);
        get_ARB1_branch_label_name(ctx,faillabel,failbranch,sizeof(failbranch));
        get_ARB1_branch_label_name(ctx,toplabel,topbranch,sizeof(topbranch));

        assert(((size_t) ctx->branch_labels_stack_index) <
                STATICARRAYLEN(ctx->branch_labels_stack)-1);

        ctx->branch_labels_stack[ctx->branch_labels_stack_index++] = toplabel;
        ctx->branch_labels_stack[ctx->branch_labels_stack_index++] = faillabel;

        char scratch[32];
        snprintf(scratch, sizeof (scratch), "rep%d", ctx->reps);
        output_line(ctx, "MOVC %s.x, %s;", scratch, src0);
        output_line(ctx, "BRA %s (LE.x);", failbranch);
        output_line(ctx, "%s:", topbranch);
    } // else if

    else  // stock ARB1 has no branching.
    {
        fail(ctx, "branching unsupported in this profile");
    } // else
} // emit_ARB1_REP


static void emit_ARB1_ENDREP(Context *ctx)
{
    // nv2 fragment programs (and everything nv4) have a real ENDREP.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        output_line(ctx, "ENDREP;");

    else if (support_nv2(ctx))
    {
        // no ENDREP, but we can use branches.
        assert(ctx->branch_labels_stack_index >= 2);

        char failbranch[32];
        char topbranch[32];
        const int faillabel = ctx->branch_labels_stack[--ctx->branch_labels_stack_index];
        const int toplabel = ctx->branch_labels_stack[--ctx->branch_labels_stack_index];
        get_ARB1_branch_label_name(ctx,faillabel,failbranch,sizeof(failbranch));
        get_ARB1_branch_label_name(ctx,toplabel,topbranch,sizeof(topbranch));

        char scratch[32];
        snprintf(scratch, sizeof (scratch), "rep%d", ctx->reps);
        output_line(ctx, "SUBC %s.x, %s.x, 1.0;", scratch, scratch);
        output_line(ctx, "BRA %s (GT.x);", topbranch);
        output_line(ctx, "%s:", failbranch);
    } // else if

    else  // stock ARB1 has no branching.
    {
        fail(ctx, "branching unsupported in this profile");
    } // else
} // emit_ARB1_ENDREP


static void nv2_if(Context *ctx)
{
    // The condition code register MUST be set up before this!
    // nv2 fragment programs (and everything nv4) have a real IF.
    if ( (support_nv4(ctx)) || (shader_is_pixel(ctx)) )
        output_line(ctx, "IF EQ.x;");
    else
    {
        // there's no IF construct, but we can use a branch to a label.
        char failbranch[32];
        const int label = allocate_branch_label(ctx);
        get_ARB1_branch_label_name(ctx, label, failbranch, sizeof (failbranch));

        assert(((size_t) ctx->branch_labels_stack_index)
                 < STATICARRAYLEN(ctx->branch_labels_stack));

        ctx->branch_labels_stack[ctx->branch_labels_stack_index++] = label;

        // !!! FIXME: should this be NE? (EQ would jump to the ELSE for the IF condition, right?).
        output_line(ctx, "BRA %s (EQ.x);", failbranch);
    } // else
} // nv2_if


static void emit_ARB1_IF(Context *ctx)
{
    if (support_nv2(ctx))
    {
        char buf[64]; allocate_ARB1_scratch_reg_name(ctx, buf, sizeof (buf));
        char src0[64]; get_ARB1_srcarg_varname(ctx, 0, src0, sizeof (src0));
        output_line(ctx, "MOVC %s.x, %s;", buf, src0);
        nv2_if(ctx);
    } // if

    else  // stock ARB1 has no branching.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    } // else
} // emit_ARB1_IF


static void emit_ARB1_ELSE(Context *ctx)
{
    // nv2 fragment programs (and everything nv4) have a real ELSE.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        output_line(ctx, "ELSE;");

    else if (support_nv2(ctx))
    {
        // there's no ELSE construct, but we can use a branch to a label.
        assert(ctx->branch_labels_stack_index > 0);

        // At the end of the IF block, unconditionally jump to the ENDIF.
        const int endlabel = allocate_branch_label(ctx);
        char endbranch[32];
        get_ARB1_branch_label_name(ctx,endlabel,endbranch,sizeof (endbranch));
        output_line(ctx, "BRA %s;", endbranch);

        // Now mark the ELSE section with a lable.
        const int elselabel = ctx->branch_labels_stack[ctx->branch_labels_stack_index-1];
        char elsebranch[32];
        get_ARB1_branch_label_name(ctx,elselabel,elsebranch,sizeof(elsebranch));
        output_line(ctx, "%s:", elsebranch);

        // Replace the ELSE label with the ENDIF on the label stack.
        ctx->branch_labels_stack[ctx->branch_labels_stack_index-1] = endlabel;
    } // else if

    else  // stock ARB1 has no branching.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    } // else
} // emit_ARB1_ELSE


static void emit_ARB1_ENDIF(Context *ctx)
{
    // nv2 fragment programs (and everything nv4) have a real ENDIF.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        output_line(ctx, "ENDIF;");

    else if (support_nv2(ctx))
    {
        // there's no ENDIF construct, but we can use a branch to a label.
        assert(ctx->branch_labels_stack_index > 0);
        const int endlabel = ctx->branch_labels_stack[--ctx->branch_labels_stack_index];
        char endbranch[32];
        get_ARB1_branch_label_name(ctx,endlabel,endbranch,sizeof (endbranch));
        output_line(ctx, "%s:", endbranch);
    } // if

    else  // stock ARB1 has no branching.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    } // else
} // emit_ARB1_ENDIF


static void emit_ARB1_BREAK(Context *ctx)
{
    // nv2 fragment programs (and everything nv4) have a real BREAK.
    if ( (support_nv4(ctx)) || ((support_nv2(ctx)) && (shader_is_pixel(ctx))) )
        output_line(ctx, "BRK;");

    else if (support_nv2(ctx))
    {
        // no BREAK, but we can use branches.
        assert(ctx->branch_labels_stack_index >= 2);
        const int faillabel = ctx->branch_labels_stack[ctx->branch_labels_stack_index];
        char failbranch[32];
        get_ARB1_branch_label_name(ctx,faillabel,failbranch,sizeof(failbranch));
        output_line(ctx, "BRA %s;", failbranch);
    } // else if

    else  // stock ARB1 has no branching.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    } // else
} // emit_ARB1_BREAK


static void emit_ARB1_MOVA(Context *ctx)
{
    // nv2 and nv3 can use the ARR opcode.
    // But nv4 removed ARR (and ADDRESS registers!). Just ROUND to an INT.
    if (support_nv4(ctx))
        emit_ARB1_opcode_ds(ctx, "ROUND.S");  // !!! FIXME: don't use a modifier here.
    else if ((support_nv2(ctx)) || (support_nv3(ctx)))
        emit_ARB1_opcode_ds(ctx, "ARR");
    else
    {
        char src0[64];
        char scratch[64];
        char addr[32];

        make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        allocate_ARB1_scratch_reg_name(ctx, scratch, sizeof (scratch));
        snprintf(addr, sizeof (addr), "addr%d", ctx->dest_arg.regnum);

        // !!! FIXME: we can optimize this if src_mod is ABS or ABSNEGATE.

        // ARL uses floor(), but D3D expects round-to-nearest.
        // There is probably a more efficient way to do this.
        if (shader_is_pixel(ctx))  // CMP only exists in fragment programs.  :/
            output_line(ctx, "CMP %s, %s, -1.0, 1.0;", scratch, src0);
        else
        {
            output_line(ctx, "SLT %s, %s, 0.0;", scratch, src0);
            output_line(ctx, "MAD %s, %s, -2.0, 1.0;", scratch, scratch);
        } // else

        output_line(ctx, "ABS %s, %s;", addr, src0);
        output_line(ctx, "ADD %s, %s, 0.5;", addr, addr);
        output_line(ctx, "FLR %s, %s;", addr, addr);
        output_line(ctx, "MUL %s, %s, %s;", addr, addr, scratch);

        // we don't handle these right now, since emit_ARB1_dest_modifiers(ctx)
        //  wants to look at dest_arg, not our temp register.
        assert(ctx->dest_arg.result_mod == 0);
        assert(ctx->dest_arg.result_shift == 0);

        // we assign to the actual address register as needed.
        ctx->last_address_reg_component = -1;
    } // else
} // emit_ARB1_MOVA


static void emit_ARB1_TEXKILL(Context *ctx)
{
    // d3d kills on xyz, arb1 kills on xyzw. Fix the swizzle.
    //  We just map the x component to w. If it's negative, the fragment
    //  would discard anyhow, otherwise, it'll pass through okay. This saves
    //  us a temp register.
    char dst[64];
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
    output_line(ctx, "KIL %s.xyzx;", dst);
} // emit_ARB1_TEXKILL

static void arb1_texbem(Context *ctx, const int luminance)
{
    // !!! FIXME: this code counts on the register not having swizzles, etc.
    const int stage = ctx->dest_arg.regnum;
    char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
    char src[64]; get_ARB1_srcarg_varname(ctx, 0, src, sizeof (src));
    char tmp[64]; allocate_ARB1_scratch_reg_name(ctx, tmp, sizeof (tmp));
    char sampler[64];
    get_ARB1_varname_in_buf(ctx, REG_TYPE_SAMPLER, stage,
                            sampler, sizeof (sampler));

    output_line(ctx, "MUL %s, %s_texbem.xzyw, %s.xyxy;", tmp, sampler, src);
    output_line(ctx, "ADD %s.xy, %s.xzxx, %s.ywxx;", tmp, tmp, tmp);
    output_line(ctx, "ADD %s.xy, %s, %s;", tmp, tmp, dst);
    output_line(ctx, "TEX %s, %s, texture[%d], 2D;", dst, tmp, stage);

    if (luminance)  // TEXBEML, not just TEXBEM?
    {
        output_line(ctx, "MAD %s, %s.zzzz, %s_texbeml.xxxx, %s_texbeml.yyyy;",
                    tmp, src, sampler, sampler);
        output_line(ctx, "MUL %s, %s, %s;", dst, dst, tmp);
    } // if

    emit_ARB1_dest_modifiers(ctx);
} // arb1_texbem

static void emit_ARB1_TEXBEM(Context *ctx)
{
    arb1_texbem(ctx, 0);
} // emit_ARB1_TEXBEM

static void emit_ARB1_TEXBEML(Context *ctx)
{
    arb1_texbem(ctx, 1);
} // emit_ARB1_TEXBEML

EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2AR)
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2GB)


static void emit_ARB1_TEXM3X2PAD(Context *ctx)
{
    // no-op ... work happens in emit_ARB1_TEXM3X2TEX().
} // emit_ARB1_TEXM3X2PAD

static void emit_ARB1_TEXM3X2TEX(Context *ctx)
{
    if (ctx->texm3x2pad_src0 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    const int stage = ctx->dest_arg.regnum;
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_src0,
                            src0, sizeof (src0));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_dst0,
                            src1, sizeof (src1));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src2, sizeof (src2));
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));

    output_line(ctx, "DP3 %s.y, %s, %s;", dst, src2, dst);
    output_line(ctx, "DP3 %s.x, %s, %s;", dst, src0, src1);
    output_line(ctx, "TEX %s, %s, texture[%d], 2D;", dst, dst, stage);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_TEXM3X2TEX


static void emit_ARB1_TEXM3X3PAD(Context *ctx)
{
    // no-op ... work happens in emit_ARB1_TEXM3X3*().
} // emit_ARB1_TEXM3X3PAD


static void emit_ARB1_TEXM3X3TEX(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    const int stage = ctx->dest_arg.regnum;
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, stage);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "CUBE" : "3D";

    output_line(ctx, "DP3 %s.z, %s, %s;", dst, dst, src4);
    output_line(ctx, "DP3 %s.x, %s, %s;", dst, src0, src1);
    output_line(ctx, "DP3 %s.y, %s, %s;", dst, src2, src3);
    output_line(ctx, "TEX %s, %s, texture[%d], %s;", dst, dst, stage, ttypestr);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_TEXM3X3TEX

static void emit_ARB1_TEXM3X3SPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char src5[64];
    char tmp[64];
    char tmp2[64];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    const int stage = ctx->dest_arg.regnum;
    allocate_ARB1_scratch_reg_name(ctx, tmp, sizeof (tmp));
    allocate_ARB1_scratch_reg_name(ctx, tmp2, sizeof (tmp2));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[1].regnum,
                            src5, sizeof (src5));
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, stage);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "CUBE" : "3D";

    output_line(ctx, "DP3 %s.z, %s, %s;", dst, dst, src4);
    output_line(ctx, "DP3 %s.x, %s, %s;", dst, src0, src1);
    output_line(ctx, "DP3 %s.y, %s, %s;", dst, src2, src3);
    output_line(ctx, "MUL %s, %s, %s;", tmp, dst, dst);    // normal * normal
    output_line(ctx, "MUL %s, %s, %s;", tmp2, dst, src5);  // normal * eyeray

    // !!! FIXME: This is goofy. There's got to be a way to do vector-wide
    // !!! FIXME:  divides or reciprocals...right?
    output_line(ctx, "RCP %s.x, %s.x;", tmp2, tmp2);
    output_line(ctx, "RCP %s.y, %s.y;", tmp2, tmp2);
    output_line(ctx, "RCP %s.z, %s.z;", tmp2, tmp2);
    output_line(ctx, "RCP %s.w, %s.w;", tmp2, tmp2);
    output_line(ctx, "MUL %s, %s, %s;", tmp, tmp, tmp2);

    output_line(ctx, "MUL %s, %s, { 2.0, 2.0, 2.0, 2.0 };", tmp, tmp);
    output_line(ctx, "MAD %s, %s, %s, -%s;", tmp, tmp, dst, src5);
    output_line(ctx, "TEX %s, %s, texture[%d], %s;", dst, tmp, stage, ttypestr);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_TEXM3X3SPEC

static void emit_ARB1_TEXM3X3VSPEC(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];
    char tmp[64];
    char tmp2[64];
    char tmp3[64];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    const int stage = ctx->dest_arg.regnum;
    allocate_ARB1_scratch_reg_name(ctx, tmp, sizeof (tmp));
    allocate_ARB1_scratch_reg_name(ctx, tmp2, sizeof (tmp2));
    allocate_ARB1_scratch_reg_name(ctx, tmp3, sizeof (tmp3));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, stage);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
    const char *ttypestr = (ttype == TEXTURE_TYPE_CUBE) ? "CUBE" : "3D";

    output_line(ctx, "MOV %s.x, %s.w;", tmp3, src0);
    output_line(ctx, "MOV %s.y, %s.w;", tmp3, src2);
    output_line(ctx, "MOV %s.z, %s.w;", tmp3, dst);
    output_line(ctx, "DP3 %s.z, %s, %s;", dst, dst, src4);
    output_line(ctx, "DP3 %s.x, %s, %s;", dst, src0, src1);
    output_line(ctx, "DP3 %s.y, %s, %s;", dst, src2, src3);
    output_line(ctx, "MUL %s, %s, %s;", tmp, dst, dst);    // normal * normal
    output_line(ctx, "MUL %s, %s, %s;", tmp2, dst, tmp3);  // normal * eyeray

    // !!! FIXME: This is goofy. There's got to be a way to do vector-wide
    // !!! FIXME:  divides or reciprocals...right?
    output_line(ctx, "RCP %s.x, %s.x;", tmp2, tmp2);
    output_line(ctx, "RCP %s.y, %s.y;", tmp2, tmp2);
    output_line(ctx, "RCP %s.z, %s.z;", tmp2, tmp2);
    output_line(ctx, "RCP %s.w, %s.w;", tmp2, tmp2);
    output_line(ctx, "MUL %s, %s, %s;", tmp, tmp, tmp2);

    output_line(ctx, "MUL %s, %s, { 2.0, 2.0, 2.0, 2.0 };", tmp, tmp);
    output_line(ctx, "MAD %s, %s, %s, -%s;", tmp, tmp, dst, tmp3);
    output_line(ctx, "TEX %s, %s, texture[%d], %s;", dst, tmp, stage, ttypestr);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_TEXM3X3VSPEC

static void emit_ARB1_EXPP(Context *ctx) { emit_ARB1_opcode_ds(ctx, "EX2"); }
static void emit_ARB1_LOGP(Context *ctx) { arb1_log(ctx, "LG2"); }

static void emit_ARB1_CND(Context *ctx)
{
    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
    char tmp[64]; allocate_ARB1_scratch_reg_name(ctx, tmp, sizeof (tmp));

    // CND compares against 0.5, but we need to compare against 0.0...
    //  ...subtract to make up the difference.
    output_line(ctx, "SUB %s, %s, { 0.5, 0.5, 0.5, 0.5 };", tmp, src0);
    // D3D tests (src0 >= 0.0), but ARB1 tests (src0 < 0.0) ... so just
    //  switch src1 and src2 to get the same results.
    output_line(ctx, "CMP%s, %s, %s, %s;", dst, tmp, src2, src1);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_CND

EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2RGB)
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3TEX)
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X2DEPTH)
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3)

static void emit_ARB1_TEXM3X3(Context *ctx)
{
    if (ctx->texm3x3pad_src1 == -1)
        return;

    char dst[64];
    char src0[64];
    char src1[64];
    char src2[64];
    char src3[64];
    char src4[64];

    // !!! FIXME: this code counts on the register not having swizzles, etc.
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
                            src0, sizeof (src0));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
                            src1, sizeof (src1));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
                            src2, sizeof (src2));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
                            src3, sizeof (src3));
    get_ARB1_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
                            src4, sizeof (src4));
    get_ARB1_destarg_varname(ctx, dst, sizeof (dst));

    output_line(ctx, "DP3 %s.z, %s, %s;", dst, dst, src4);
    output_line(ctx, "DP3 %s.x, %s, %s;", dst, src0, src1);
    output_line(ctx, "DP3 %s.y, %s, %s;", dst, src2, src3);
    output_line(ctx, "MOV %s.w, { 1.0, 1.0, 1.0, 1.0 };", dst);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_TEXM3X3

EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXDEPTH)

static void emit_ARB1_CMP(Context *ctx)
{
    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
    char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
    char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
    char src2[64]; make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
    // D3D tests (src0 >= 0.0), but ARB1 tests (src0 < 0.0) ... so just
    //  switch src1 and src2 to get the same results.
    output_line(ctx, "CMP%s, %s, %s, %s;", dst, src0, src2, src1);
    emit_ARB1_dest_modifiers(ctx);
} // emit_ARB1_CMP

EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(BEM)


static void emit_ARB1_DP2ADD(Context *ctx)
{
    if (support_nv4(ctx))  // nv4 has a built-in equivalent to DP2ADD.
        emit_ARB1_opcode_dsss(ctx, "DP2A");
    else
    {
        char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));
        char src0[64]; make_ARB1_srcarg_string(ctx, 0, src0, sizeof (src0));
        char src1[64]; make_ARB1_srcarg_string(ctx, 1, src1, sizeof (src1));
        char src2[64]; make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
        char scratch[64];

        // DP2ADD is:
        //  dst = (src0.r * src1.r) + (src0.g * src1.g) + src2.replicate_swiz
        allocate_ARB1_scratch_reg_name(ctx, scratch, sizeof (scratch));
        output_line(ctx, "MUL %s, %s, %s;", scratch, src0, src1);
        output_line(ctx, "ADD %s, %s.x, %s.y;", scratch, scratch, scratch);
        output_line(ctx, "ADD%s, %s.x, %s;", dst, scratch, src2);
        emit_ARB1_dest_modifiers(ctx);
    } // else
} // emit_ARB1_DP2ADD


static void emit_ARB1_DSX(Context *ctx)
{
    if (support_nv2(ctx))  // nv2 has a built-in equivalent to DSX.
        emit_ARB1_opcode_ds(ctx, "DDX");
    else
        failf(ctx, "DSX unsupported in %s profile", ctx->profile->name);
} // emit_ARB1_DSX


static void emit_ARB1_DSY(Context *ctx)
{
    if (support_nv2(ctx))  // nv2 has a built-in equivalent to DSY.
        emit_ARB1_opcode_ds(ctx, "DDY");
    else
        failf(ctx, "DSY unsupported in %s profile", ctx->profile->name);
} // emit_ARB1_DSY

static void arb1_texld(Context *ctx, const char *opcode, const int texldd)
{
    // !!! FIXME: Hack: "TEXH" is invalid in nv4. Fix this more cleanly.
    if ((ctx->dest_arg.result_mod & MOD_PP) && (support_nv4(ctx)))
        ctx->dest_arg.result_mod &= ~MOD_PP;

    char dst[64]; make_ARB1_destarg_string(ctx, dst, sizeof (dst));

    const int sm1 = !shader_version_atleast(ctx, 1, 4);
    const int regnum = sm1 ? ctx->dest_arg.regnum : ctx->source_args[1].regnum;
    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, regnum);

    const char *ttype = NULL;
    char src0[64];
    if (sm1)
        get_ARB1_destarg_varname(ctx, src0, sizeof (src0));
    else
        get_ARB1_srcarg_varname(ctx, 0, src0, sizeof (src0));
    //char src1[64]; get_ARB1_srcarg_varname(ctx, 1, src1, sizeof (src1));  // !!! FIXME: SRC_MOD?

    char src2[64] = { 0 };
    char src3[64] = { 0 };

    if (texldd)
    {
        make_ARB1_srcarg_string(ctx, 2, src2, sizeof (src2));
        make_ARB1_srcarg_string(ctx, 3, src3, sizeof (src3));
    } // if

    // !!! FIXME: this should be in state_TEXLD, not in the arb1/glsl emitters.
    if (sreg == NULL)
    {
        fail(ctx, "TEXLD using undeclared sampler");
        return;
    } // if

    // SM1 only specifies dst, so don't check swizzle there.
    if ( !sm1 && (!no_swizzle(ctx->source_args[1].swizzle)) )
    {
        // !!! FIXME: does this ever actually happen?
        fail(ctx, "BUG: can't handle TEXLD with sampler swizzle at the moment");
    } // if

    switch ((const TextureType) sreg->index)
    {
        case TEXTURE_TYPE_2D: ttype = "2D"; break; // !!! FIXME: "RECT"?
        case TEXTURE_TYPE_CUBE: ttype = "CUBE"; break;
        case TEXTURE_TYPE_VOLUME: ttype = "3D"; break;
        default: fail(ctx, "unknown texture type"); return;
    } // switch

    if (texldd)
    {
        output_line(ctx, "%s%s, %s, %s, %s, texture[%d], %s;", opcode, dst,
                    src0, src2, src3, regnum, ttype);
    } // if
    else
    {
        output_line(ctx, "%s%s, %s, texture[%d], %s;", opcode, dst, src0,
                    regnum, ttype);
    } // else
} // arb1_texld


static void emit_ARB1_TEXLDD(Context *ctx)
{
    // With GL_NV_fragment_program2, we can use the TXD opcode.
    //  In stock arb1, we can settle for a standard texld, which isn't
    //  perfect, but oh well.
    if (support_nv2(ctx))
        arb1_texld(ctx, "TXD", 1);
    else
        arb1_texld(ctx, "TEX", 0);
} // emit_ARB1_TEXLDD


static void emit_ARB1_TEXLDL(Context *ctx)
{
    if ((shader_is_vertex(ctx)) && (!support_nv3(ctx)))
    {
        failf(ctx, "Vertex shader TEXLDL unsupported in %s profile",
              ctx->profile->name);
        return;
    } // if

    else if ((shader_is_pixel(ctx)) && (!support_nv2(ctx)))
    {
        failf(ctx, "Pixel shader TEXLDL unsupported in %s profile",
              ctx->profile->name);
        return;
    } // if

    // !!! FIXME: this doesn't map exactly to TEXLDL. Review this.
    arb1_texld(ctx, "TXL", 0);
} // emit_ARB1_TEXLDL


EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(BREAKP)
EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(BREAKC)

static void emit_ARB1_IFC(Context *ctx)
{
    if (support_nv2(ctx))
    {
        static const char *comps[] = {
            "", "SGTC", "SEQC", "SGEC", "SGTC", "SNEC", "SLEC"
        };

        if (ctx->instruction_controls >= STATICARRAYLEN(comps))
        {
            fail(ctx, "unknown comparison control");
            return;
        } // if

        char src0[64];
        char src1[64];
        char scratch[64];

        const char *comp = comps[ctx->instruction_controls];
        get_ARB1_srcarg_varname(ctx, 0, src0, sizeof (src0));
        get_ARB1_srcarg_varname(ctx, 1, src1, sizeof (src1));
        allocate_ARB1_scratch_reg_name(ctx, scratch, sizeof (scratch));
        output_line(ctx, "%s %s.x, %s, %s;", comp, scratch, src0, src1);
        nv2_if(ctx);
    } // if

    else  // stock ARB1 has no branching.
    {
        failf(ctx, "branching unsupported in %s profile", ctx->profile->name);
    } // else
} // emit_ARB1_IFC


EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(SETP)

static void emit_ARB1_DEF(Context *ctx)
{
    const float *val = (const float *) ctx->dwords; // !!! FIXME: could be int?
    char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
    char val0[32]; floatstr(ctx, val0, sizeof (val0), val[0], 1);
    char val1[32]; floatstr(ctx, val1, sizeof (val1), val[1], 1);
    char val2[32]; floatstr(ctx, val2, sizeof (val2), val[2], 1);
    char val3[32]; floatstr(ctx, val3, sizeof (val3), val[3], 1);

    push_output(ctx, &ctx->globals);
    output_line(ctx, "PARAM %s = { %s, %s, %s, %s };",
                dst, val0, val1, val2, val3);
    pop_output(ctx);
} // emit_ARB1_DEF

static void emit_ARB1_DEFI(Context *ctx)
{
    char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
    const int32 *x = (const int32 *) ctx->dwords;
    push_output(ctx, &ctx->globals);
    output_line(ctx, "PARAM %s = { %d, %d, %d, %d };",
                dst, (int) x[0], (int) x[1], (int) x[2], (int) x[3]);
    pop_output(ctx);
} // emit_ARB1_DEFI

static void emit_ARB1_DEFB(Context *ctx)
{
    char dst[64]; get_ARB1_destarg_varname(ctx, dst, sizeof (dst));
    push_output(ctx, &ctx->globals);
    output_line(ctx, "PARAM %s = %d;", dst, ctx->dwords[0] ? 1 : 0);
    pop_output(ctx);
} // emit_ARB1_DEFB

static void emit_ARB1_DCL(Context *ctx)
{
    // no-op. We do this in our emit_attribute() and emit_uniform().
} // emit_ARB1_DCL

EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXCRD)

static void emit_ARB1_TEXLD(Context *ctx)
{
    if (!shader_version_atleast(ctx, 1, 4))
    {
        arb1_texld(ctx, "TEX", 0);
        return;
    } // if

    else if (!shader_version_atleast(ctx, 2, 0))
    {
        // ps_1_4 is different, too!
        fail(ctx, "TEXLD == Shader Model 1.4 unimplemented.");  // !!! FIXME
        return;
    } // if

    // !!! FIXME: do texldb and texldp map between OpenGL and D3D correctly?
    if (ctx->instruction_controls == CONTROL_TEXLD)
        arb1_texld(ctx, "TEX", 0);
    else if (ctx->instruction_controls == CONTROL_TEXLDP)
        arb1_texld(ctx, "TXP", 0);
    else if (ctx->instruction_controls == CONTROL_TEXLDB)
        arb1_texld(ctx, "TXB", 0);
} // emit_ARB1_TEXLD

#endif  // SUPPORT_PROFILE_ARB1


#if !AT_LEAST_ONE_PROFILE
#error No profiles are supported. Fix your build.
#endif

#define DEFINE_PROFILE(prof) { \
    MOJOSHADER_PROFILE_##prof, \
    emit_##prof##_start, \
    emit_##prof##_end, \
    emit_##prof##_phase, \
    emit_##prof##_global, \
    emit_##prof##_array, \
    emit_##prof##_const_array, \
    emit_##prof##_uniform, \
    emit_##prof##_sampler, \
    emit_##prof##_attribute, \
    emit_##prof##_finalize, \
    get_##prof##_varname, \
    get_##prof##_const_array_varname, \
},

static const Profile profiles[] =
{
#if SUPPORT_PROFILE_D3D
    DEFINE_PROFILE(D3D)
#endif
#if SUPPORT_PROFILE_BYTECODE
    DEFINE_PROFILE(BYTECODE)
#endif
#if SUPPORT_PROFILE_GLSL
    DEFINE_PROFILE(GLSL)
#endif
#if SUPPORT_PROFILE_ARB1
    DEFINE_PROFILE(ARB1)
#endif
#if SUPPORT_PROFILE_METAL
    DEFINE_PROFILE(METAL)
#endif
};

#undef DEFINE_PROFILE

// This is for profiles that extend other profiles...
static const struct { const char *from; const char *to; } profileMap[] =
{
    { MOJOSHADER_PROFILE_GLSLES, MOJOSHADER_PROFILE_GLSL },
    { MOJOSHADER_PROFILE_GLSL120, MOJOSHADER_PROFILE_GLSL },
    { MOJOSHADER_PROFILE_NV2, MOJOSHADER_PROFILE_ARB1 },
    { MOJOSHADER_PROFILE_NV3, MOJOSHADER_PROFILE_ARB1 },
    { MOJOSHADER_PROFILE_NV4, MOJOSHADER_PROFILE_ARB1 },
};


// The PROFILE_EMITTER_* items MUST be in the same order as profiles[]!
#define PROFILE_EMITTERS(op) { \
     PROFILE_EMITTER_D3D(op) \
     PROFILE_EMITTER_BYTECODE(op) \
     PROFILE_EMITTER_GLSL(op) \
     PROFILE_EMITTER_ARB1(op) \
     PROFILE_EMITTER_METAL(op) \
}

static int parse_destination_token(Context *ctx, DestArgInfo *info)
{
    // !!! FIXME: recheck against the spec for ranges (like RASTOUT values, etc).
    if (ctx->tokencount == 0)
    {
        fail(ctx, "Out of tokens in destination parameter");
        return 0;
    } // if

    const uint32 token = SWAP32(*(ctx->tokens));
    const int reserved1 = (int) ((token >> 14) & 0x3); // bits 14 through 15
    const int reserved2 = (int) ((token >> 31) & 0x1); // bit 31

    info->token = ctx->tokens;
    info->regnum = (int) (token & 0x7ff);  // bits 0 through 10
    info->relative = (int) ((token >> 13) & 0x1); // bit 13
    info->orig_writemask = (int) ((token >> 16) & 0xF); // bits 16 through 19
    info->result_mod = (int) ((token >> 20) & 0xF); // bits 20 through 23
    info->result_shift = (int) ((token >> 24) & 0xF); // bits 24 through 27      abc
    info->regtype = (RegisterType) (((token >> 28) & 0x7) | ((token >> 8) & 0x18));  // bits 28-30, 11-12

    int writemask;
    if (isscalar(ctx, ctx->shader_type, info->regtype, info->regnum))
        writemask = 0x1;  // just x.
    else
        writemask = info->orig_writemask;

    set_dstarg_writemask(info, writemask);  // bits 16 through 19.

    // all the REG_TYPE_CONSTx types are the same register type, it's just
    //  split up so its regnum can be > 2047 in the bytecode. Clean it up.
    if (info->regtype == REG_TYPE_CONST2)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 2048;
    } // else if
    else if (info->regtype == REG_TYPE_CONST3)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 4096;
    } // else if
    else if (info->regtype == REG_TYPE_CONST4)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 6144;
    } // else if

    // swallow token for now, for multiple calls in a row.
    adjust_token_position(ctx, 1);

    if (reserved1 != 0x0)
        fail(ctx, "Reserved bit #1 in destination token must be zero");

    if (reserved2 != 0x1)
        fail(ctx, "Reserved bit #2 in destination token must be one");

    if (info->relative)
    {
        if (!shader_is_vertex(ctx))
            fail(ctx, "Relative addressing in non-vertex shader");
        if (!shader_version_atleast(ctx, 3, 0))
            fail(ctx, "Relative addressing in vertex shader version < 3.0");
        if ((!ctx->ctab.have_ctab) && (!ctx->ignores_ctab))
        {
            // it's hard to do this efficiently without!
            fail(ctx, "relative addressing unsupported without a CTAB");
        } // if

        // !!! FIXME: I don't have a shader that has a relative dest currently.
        fail(ctx, "Relative addressing of dest tokens is unsupported");
        return 2;
    } // if

    const int s = info->result_shift;
    if (s != 0)
    {
        if (!shader_is_pixel(ctx))
            fail(ctx, "Result shift scale in non-pixel shader");
        if (shader_version_atleast(ctx, 2, 0))
            fail(ctx, "Result shift scale in pixel shader version >= 2.0");
        if ( ! (((s >= 1) && (s <= 3)) || ((s >= 0xD) && (s <= 0xF))) )
            fail(ctx, "Result shift scale isn't 1 to 3, or 13 to 15.");
    } // if

    if (info->result_mod & MOD_PP)  // Partial precision (pixel shaders only)
    {
        if (!shader_is_pixel(ctx))
            fail(ctx, "Partial precision result mod in non-pixel shader");
    } // if

    if (info->result_mod & MOD_CENTROID)  // Centroid (pixel shaders only)
    {
        if (!shader_is_pixel(ctx))
            fail(ctx, "Centroid result mod in non-pixel shader");
        else if (!ctx->centroid_allowed)  // only on DCL opcodes!
            fail(ctx, "Centroid modifier not allowed here");
    } // if

    if (/*(info->regtype < 0) ||*/ (info->regtype > REG_TYPE_MAX))
        fail(ctx, "Register type is out of range");

    if (!isfail(ctx))
        set_used_register(ctx, info->regtype, info->regnum, 1);

    return 1;
} // parse_destination_token


static void determine_constants_arrays(Context *ctx)
{
    // Only process this stuff once. This is called after all DEF* opcodes
    //  could have been parsed.
    if (ctx->determined_constants_arrays)
        return;

    ctx->determined_constants_arrays = 1;

    if (ctx->constant_count <= 1)
        return;  // nothing to sort or group.

    // Sort the linked list into an array for easier tapdancing...
    ConstantsList **array = (ConstantsList **) alloca(sizeof (ConstantsList *) * (ctx->constant_count + 1));
    ConstantsList *item = ctx->constants;
    int i;

    for (i = 0; i < ctx->constant_count; i++)
    {
        if (item == NULL)
        {
            fail(ctx, "BUG: mismatched constant list and count");
            return;
        } // if

        array[i] = item;
        item = item->next;
    } // for

    array[ctx->constant_count] = NULL;

    // bubble sort ftw.
    int sorted;
    do
    {
        sorted = 1;
        for (i = 0; i < ctx->constant_count-1; i++)
        {
            if (array[i]->constant.index > array[i+1]->constant.index)
            {
                ConstantsList *tmp = array[i];
                array[i] = array[i+1];
                array[i+1] = tmp;
                sorted = 0;
            } // if
        } // for
    } while (!sorted);

    // okay, sorted. While we're here, let's redo the linked list in order...
    for (i = 0; i < ctx->constant_count; i++)
        array[i]->next = array[i+1];
    ctx->constants = array[0];

    // now figure out the groupings of constants and add to ctx->variables...
    int start = -1;
    int prev = -1;
    int count = 0;
    const int hi = ctx->constant_count;
    for (i = 0; i <= hi; i++)
    {
        if (array[i] && (array[i]->constant.type != MOJOSHADER_UNIFORM_FLOAT))
            continue;  // we only care about REG_TYPE_CONST for array groups.

        if (start == -1)
        {
            prev = start = i;  // first REG_TYPE_CONST we've seen. Mark it!
            continue;
        } // if

        // not a match (or last item in the array)...see if we had a
        //  contiguous set before this point...
        if ( (array[i]) && (array[i]->constant.index == (array[prev]->constant.index + 1)) )
            count++;
        else
        {
            if (count > 0)  // multiple constants in the set?
            {
                VariableList *var;
                var = (VariableList *) Malloc(ctx, sizeof (VariableList));
                if (var == NULL)
                    break;

                var->type = MOJOSHADER_UNIFORM_FLOAT;
                var->index = array[start]->constant.index;
                var->count = (array[prev]->constant.index - var->index) + 1;
                var->constant = array[start];
                var->used = 0;
                var->emit_position = -1;
                var->next = ctx->variables;
                ctx->variables = var;
            } // else

            start = i;   // set this as new start of sequence.
        } // if

        prev = i;
    } // for
} // determine_constants_arrays


static int adjust_swizzle(const Context *ctx, const RegisterType regtype,
                          const int regnum, const int swizzle)
{
    if (regtype != REG_TYPE_INPUT)  // !!! FIXME: maybe lift this later?
        return swizzle;
    else if (ctx->swizzles_count == 0)
        return swizzle;

    const RegisterList *reg = reglist_find(&ctx->attributes, regtype, regnum);
    if (reg == NULL)
        return swizzle;

    size_t i;
    for (i = 0; i < ctx->swizzles_count; i++)
    {
        const MOJOSHADER_swizzle *swiz = &ctx->swizzles[i];
        if ((swiz->usage == reg->usage) && (swiz->index == reg->index))
        {
            return ( (((int)(swiz->swizzles[((swizzle >> 0) & 0x3)])) << 0) |
                     (((int)(swiz->swizzles[((swizzle >> 2) & 0x3)])) << 2) |
                     (((int)(swiz->swizzles[((swizzle >> 4) & 0x3)])) << 4) |
                     (((int)(swiz->swizzles[((swizzle >> 6) & 0x3)])) << 6) );
        } // if
    } // for

    return swizzle;
} // adjust_swizzle


static int parse_source_token(Context *ctx, SourceArgInfo *info)
{
    int retval = 1;

    if (ctx->tokencount == 0)
    {
        fail(ctx, "Out of tokens in source parameter");
        return 0;
    } // if

    const uint32 token = SWAP32(*(ctx->tokens));
    const int reserved1 = (int) ((token >> 14) & 0x3); // bits 14 through 15
    const int reserved2 = (int) ((token >> 31) & 0x1); // bit 31

    info->token = ctx->tokens;
    info->regnum = (int) (token & 0x7ff);  // bits 0 through 10
    info->relative = (int) ((token >> 13) & 0x1); // bit 13
    const int swizzle = (int) ((token >> 16) & 0xFF); // bits 16 through 23
    info->src_mod = (SourceMod) ((token >> 24) & 0xF); // bits 24 through 27
    info->regtype = (RegisterType) (((token >> 28) & 0x7) | ((token >> 8) & 0x18));  // bits 28-30, 11-12

    // all the REG_TYPE_CONSTx types are the same register type, it's just
    //  split up so its regnum can be > 2047 in the bytecode. Clean it up.
    if (info->regtype == REG_TYPE_CONST2)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 2048;
    } // else if
    else if (info->regtype == REG_TYPE_CONST3)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 4096;
    } // else if
    else if (info->regtype == REG_TYPE_CONST4)
    {
        info->regtype = REG_TYPE_CONST;
        info->regnum += 6144;
    } // else if

    info->swizzle = adjust_swizzle(ctx, info->regtype, info->regnum, swizzle);
    info->swizzle_x = ((info->swizzle >> 0) & 0x3);
    info->swizzle_y = ((info->swizzle >> 2) & 0x3);
    info->swizzle_z = ((info->swizzle >> 4) & 0x3);
    info->swizzle_w = ((info->swizzle >> 6) & 0x3);

    // swallow token for now, for multiple calls in a row.
    adjust_token_position(ctx, 1);

    if (reserved1 != 0x0)
        fail(ctx, "Reserved bits #1 in source token must be zero");

    if (reserved2 != 0x1)
        fail(ctx, "Reserved bit #2 in source token must be one");

    if ((info->relative) && (ctx->tokencount == 0))
    {
        fail(ctx, "Out of tokens in relative source parameter");
        info->relative = 0;  // don't try to process it.
    } // if

    if (info->relative)
    {
        if ( (shader_is_pixel(ctx)) && (!shader_version_atleast(ctx, 3, 0)) )
            fail(ctx, "Relative addressing in pixel shader version < 3.0");

        // Shader Model 1 doesn't have an extra token to specify the
        //  relative register: it's always a0.x.
        if (!shader_version_atleast(ctx, 2, 0))
        {
            info->relative_regnum = 0;
            info->relative_regtype = REG_TYPE_ADDRESS;
            info->relative_component = 0;
        } // if

        else  // Shader Model 2 and later...
        {
            const uint32 reltoken = SWAP32(*(ctx->tokens));
            // swallow token for now, for multiple calls in a row.
            adjust_token_position(ctx, 1);

            const int relswiz = (int) ((reltoken >> 16) & 0xFF);
            info->relative_regnum = (int) (reltoken & 0x7ff);
            info->relative_regtype = (RegisterType)
                                        (((reltoken >> 28) & 0x7) |
                                        ((reltoken >> 8) & 0x18));

            if (((reltoken >> 31) & 0x1) == 0)
                fail(ctx, "bit #31 in relative address must be set");

            if ((reltoken & 0xF00E000) != 0)  // usused bits.
                fail(ctx, "relative address reserved bit must be zero");

            switch (info->relative_regtype)
            {
                case REG_TYPE_LOOP:
                case REG_TYPE_ADDRESS:
                    break;
                default:
                    fail(ctx, "invalid register for relative address");
                    break;
            } // switch

            if (info->relative_regnum != 0)  // true for now.
                fail(ctx, "invalid register for relative address");

            if (!replicate_swizzle(relswiz))
                fail(ctx, "relative address needs replicate swizzle");

            info->relative_component = (relswiz & 0x3);

            retval++;
        } // else

        if (info->regtype == REG_TYPE_INPUT)
        {
            if ( (shader_is_pixel(ctx)) || (!shader_version_atleast(ctx, 3, 0)) )
                fail(ctx, "relative addressing of input registers not supported in this shader model");
            ctx->have_relative_input_registers = 1;
        } // if
        else if (info->regtype == REG_TYPE_CONST)
        {
            // figure out what array we're in...
            if (!ctx->ignores_ctab)
            {
                if (!ctx->ctab.have_ctab)  // hard to do efficiently without!
                    fail(ctx, "relative addressing unsupported without a CTAB");
                else
                {
                    determine_constants_arrays(ctx);

                    VariableList *var;
                    const int reltarget = info->regnum;
                    for (var = ctx->variables; var != NULL; var = var->next)
                    {
                        const int lo = var->index;
                        if ( (reltarget >= lo) && (reltarget < (lo + var->count)) )
                            break;  // match!
                    } // for

                    if (var == NULL)
                        fail(ctx, "relative addressing of indeterminate array");
                    else
                    {
                        var->used = 1;
                        info->relative_array = var;
                        set_used_register(ctx, info->relative_regtype, info->relative_regnum, 0);
                    } // else
                } // else
            } // if
        } // else if
        else
        {
            fail(ctx, "relative addressing of invalid register");
        } // else
    } // if

    switch (info->src_mod)
    {
        case SRCMOD_NONE:
        case SRCMOD_ABSNEGATE:
        case SRCMOD_ABS:
        case SRCMOD_NEGATE:
            break; // okay in any shader model.

        // apparently these are only legal in Shader Model 1.x ...
        case SRCMOD_BIASNEGATE:
        case SRCMOD_BIAS:
        case SRCMOD_SIGNNEGATE:
        case SRCMOD_SIGN:
        case SRCMOD_COMPLEMENT:
        case SRCMOD_X2NEGATE:
        case SRCMOD_X2:
        case SRCMOD_DZ:
        case SRCMOD_DW:
            if (shader_version_atleast(ctx, 2, 0))
                fail(ctx, "illegal source mod for this Shader Model.");
            break;

        case SRCMOD_NOT:  // !!! FIXME: I _think_ this is right...
            if (shader_version_atleast(ctx, 2, 0))
            {
                if (info->regtype != REG_TYPE_PREDICATE)
                    fail(ctx, "NOT only allowed on predicate register.");
            } // if
            break;

        default:
            fail(ctx, "Unknown source modifier");
    } // switch

    // !!! FIXME: docs say this for sm3 ... check these!
    //  "The negate modifier cannot be used on second source register of these
    //   instructions: m3x2 - ps, m3x3 - ps, m3x4 - ps, m4x3 - ps, and
    //   m4x4 - ps."
    //  "If any version 3 shader reads from one or more constant float
    //   registers (c#), one of the following must be true.
    //    All of the constant floating-point registers must use the abs modifier.
    //    None of the constant floating-point registers can use the abs modifier.

    if (!isfail(ctx))
    {
        RegisterList *reg;
        reg = set_used_register(ctx, info->regtype, info->regnum, 0);
        // !!! FIXME: this test passes if you write to the register
        // !!! FIXME:  in this same instruction, because we parse the
        // !!! FIXME:  destination token first.
        // !!! FIXME: Microsoft's shader validation explicitly checks temp
        // !!! FIXME:  registers for this...do they check other writable ones?
        if ((info->regtype == REG_TYPE_TEMP) && (reg) && (!reg->written))
            failf(ctx, "Temp register r%d used uninitialized", info->regnum);
    } // if

    return retval;
} // parse_source_token


static int parse_predicated_token(Context *ctx)
{
    SourceArgInfo *arg = &ctx->predicate_arg;
    parse_source_token(ctx, arg);
    if (arg->regtype != REG_TYPE_PREDICATE)
        fail(ctx, "Predicated instruction but not predicate register!");
    if ((arg->src_mod != SRCMOD_NONE) && (arg->src_mod != SRCMOD_NOT))
        fail(ctx, "Predicated instruction register is not NONE or NOT");
    if ( !no_swizzle(arg->swizzle) && !replicate_swizzle(arg->swizzle) )
        fail(ctx, "Predicated instruction register has wrong swizzle");
    if (arg->relative)  // I'm pretty sure this is illegal...?
        fail(ctx, "relative addressing in predicated token");

    return 1;
} // parse_predicated_token


static int parse_args_NULL(Context *ctx)
{
    return 1;
} // parse_args_NULL


static int parse_args_DEF(Context *ctx)
{
    parse_destination_token(ctx, &ctx->dest_arg);
    if (ctx->dest_arg.regtype != REG_TYPE_CONST)
        fail(ctx, "DEF using non-CONST register");
    if (ctx->dest_arg.relative)  // I'm pretty sure this is illegal...?
        fail(ctx, "relative addressing in DEF");

    ctx->dwords[0] = SWAP32(ctx->tokens[0]);
    ctx->dwords[1] = SWAP32(ctx->tokens[1]);
    ctx->dwords[2] = SWAP32(ctx->tokens[2]);
    ctx->dwords[3] = SWAP32(ctx->tokens[3]);

    return 6;
} // parse_args_DEF


static int parse_args_DEFI(Context *ctx)
{
    parse_destination_token(ctx, &ctx->dest_arg);
    if (ctx->dest_arg.regtype != REG_TYPE_CONSTINT)
        fail(ctx, "DEFI using non-CONSTING register");
    if (ctx->dest_arg.relative)  // I'm pretty sure this is illegal...?
        fail(ctx, "relative addressing in DEFI");

    ctx->dwords[0] = SWAP32(ctx->tokens[0]);
    ctx->dwords[1] = SWAP32(ctx->tokens[1]);
    ctx->dwords[2] = SWAP32(ctx->tokens[2]);
    ctx->dwords[3] = SWAP32(ctx->tokens[3]);

    return 6;
} // parse_args_DEFI


static int parse_args_DEFB(Context *ctx)
{
    parse_destination_token(ctx, &ctx->dest_arg);
    if (ctx->dest_arg.regtype != REG_TYPE_CONSTBOOL)
        fail(ctx, "DEFB using non-CONSTBOOL register");
    if (ctx->dest_arg.relative)  // I'm pretty sure this is illegal...?
        fail(ctx, "relative addressing in DEFB");

    ctx->dwords[0] = *(ctx->tokens) ? 1 : 0;

    return 3;
} // parse_args_DEFB


static int valid_texture_type(const uint32 ttype)
{
    switch ((const TextureType) ttype)
    {
        case TEXTURE_TYPE_2D:
        case TEXTURE_TYPE_CUBE:
        case TEXTURE_TYPE_VOLUME:
            return 1;  // it's okay.
    } // switch

    return 0;
} // valid_texture_type


// !!! FIXME: this function is kind of a mess.
static int parse_args_DCL(Context *ctx)
{
    int unsupported = 0;
    const uint32 token = SWAP32(*(ctx->tokens));
    const int reserved1 = (int) ((token >> 31) & 0x1); // bit 31
    uint32 reserved_mask = 0x00000000;

    if (reserved1 != 0x1)
        fail(ctx, "Bit #31 in DCL token must be one");

    ctx->centroid_allowed = 1;
    adjust_token_position(ctx, 1);
    parse_destination_token(ctx, &ctx->dest_arg);
    ctx->centroid_allowed = 0;

    if (ctx->dest_arg.result_shift != 0)  // I'm pretty sure this is illegal...?
        fail(ctx, "shift scale in DCL");
    if (ctx->dest_arg.relative)  // I'm pretty sure this is illegal...?
        fail(ctx, "relative addressing in DCL");

    const RegisterType regtype = ctx->dest_arg.regtype;
    const int regnum = ctx->dest_arg.regnum;
    if ( (shader_is_pixel(ctx)) && (shader_version_atleast(ctx, 3, 0)) )
    {
        if (regtype == REG_TYPE_INPUT)
        {
            const uint32 usage = (token & 0xF);
            const uint32 index = ((token >> 16) & 0xF);
            reserved_mask = 0x7FF0FFE0;
            ctx->dwords[0] = usage;
            ctx->dwords[1] = index;
        } // if

        else if (regtype == REG_TYPE_MISCTYPE)
        {
            const MiscTypeType mt = (MiscTypeType) regnum;
            if (mt == MISCTYPE_TYPE_POSITION)
                reserved_mask = 0x7FFFFFFF;
            else if (mt == MISCTYPE_TYPE_FACE)
            {
                reserved_mask = 0x7FFFFFFF;
                if (!writemask_xyzw(ctx->dest_arg.orig_writemask))
                    fail(ctx, "DCL face writemask must be full");
                if (ctx->dest_arg.result_mod != 0)
                    fail(ctx, "DCL face result modifier must be zero");
                if (ctx->dest_arg.result_shift != 0)
                    fail(ctx, "DCL face shift scale must be zero");
            } // else if
            else
            {
                unsupported = 1;
            } // else

            ctx->dwords[0] = (uint32) MOJOSHADER_USAGE_UNKNOWN;
            ctx->dwords[1] = 0;
        } // else if

        else if (regtype == REG_TYPE_TEXTURE)
        {
            const uint32 usage = (token & 0xF);
            const uint32 index = ((token >> 16) & 0xF);
            if (usage == MOJOSHADER_USAGE_TEXCOORD)
            {
                if (index > 7)
                    fail(ctx, "DCL texcoord usage must have 0-7 index");
            } // if
            else if (usage == MOJOSHADER_USAGE_COLOR)
            {
                if (index != 0)
                    fail(ctx, "DCL color usage must have 0 index");
            } // else if
            else
            {
                fail(ctx, "Invalid DCL texture usage");
            } // else

            reserved_mask = 0x7FF0FFE0;
            ctx->dwords[0] = usage;
            ctx->dwords[1] = index;
        } // else if

        else if (regtype == REG_TYPE_SAMPLER)
        {
            const uint32 ttype = ((token >> 27) & 0xF);
            if (!valid_texture_type(ttype))
                fail(ctx, "unknown sampler texture type");
            reserved_mask = 0x7FFFFFF;
            ctx->dwords[0] = ttype;
        } // else if

        else
        {
            unsupported = 1;
        } // else
    } // if

    else if ( (shader_is_pixel(ctx)) && (shader_version_atleast(ctx, 2, 0)) )
    {
        if (regtype == REG_TYPE_INPUT)
        {
            ctx->dwords[0] = (uint32) MOJOSHADER_USAGE_COLOR;
            ctx->dwords[1] = regnum;
            reserved_mask = 0x7FFFFFFF;
        } // if
        else if (regtype == REG_TYPE_TEXTURE)
        {
            ctx->dwords[0] = (uint32) MOJOSHADER_USAGE_TEXCOORD;
            ctx->dwords[1] = regnum;
            reserved_mask = 0x7FFFFFFF;
        } // else if
        else if (regtype == REG_TYPE_SAMPLER)
        {
            const uint32 ttype = ((token >> 27) & 0xF);
            if (!valid_texture_type(ttype))
                fail(ctx, "unknown sampler texture type");
            reserved_mask = 0x7FFFFFF;
            ctx->dwords[0] = ttype;
        } // else if
        else
        {
            unsupported = 1;
        } // else
    } // if

    else if ( (shader_is_vertex(ctx)) && (shader_version_atleast(ctx, 3, 0)) )
    {
        if ((regtype == REG_TYPE_INPUT) || (regtype == REG_TYPE_OUTPUT))
        {
            const uint32 usage = (token & 0xF);
            const uint32 index = ((token >> 16) & 0xF);
            reserved_mask = 0x7FF0FFE0;
            ctx->dwords[0] = usage;
            ctx->dwords[1] = index;
        } // if
        else if (regtype == REG_TYPE_TEXTURE)
        {
            const uint32 usage = (token & 0xF);
            const uint32 index = ((token >> 16) & 0xF);
            if (usage == MOJOSHADER_USAGE_TEXCOORD)
            {
                if (index > 7)
                    fail(ctx, "DCL texcoord usage must have 0-7 index");
            } // if
            else if (usage == MOJOSHADER_USAGE_COLOR)
            {
                if (index != 0)
                    fail(ctx, "DCL texcoord usage must have 0 index");
            } // else if
            else
                fail(ctx, "Invalid DCL texture usage");

            reserved_mask = 0x7FF0FFE0;
            ctx->dwords[0] = usage;
            ctx->dwords[1] = index;
        } // else if
        else if (regtype == REG_TYPE_SAMPLER)
        {
            const uint32 ttype = ((token >> 27) & 0xF);
            if (!valid_texture_type(ttype))
                fail(ctx, "Unknown sampler texture type");
            reserved_mask = 0x6FFFFFFF;
            ctx->dwords[0] = ttype;
        } // else if
        else
        {
            unsupported = 1;
        } // else
    } // else if

    else if ( (shader_is_vertex(ctx)) && (shader_version_atleast(ctx, 1, 1)) )
    {
        if (regtype == REG_TYPE_INPUT)
        {
            const uint32 usage = (token & 0xF);
            const uint32 index = ((token >> 16) & 0xF);
            reserved_mask = 0x7FF0FFE0;
            ctx->dwords[0] = usage;
            ctx->dwords[1] = index;
        } // if
        else
        {
            unsupported = 1;
        } // else
    } // else if

    else
    {
        unsupported = 1;
    } // else

    if (unsupported)
        fail(ctx, "invalid DCL register type for this shader model");

    if ((token & reserved_mask) != 0)
        fail(ctx, "reserved bits in DCL dword aren't zero");

    return 3;
} // parse_args_DCL


static int parse_args_D(Context *ctx)
{
    int retval = 1;
    retval += parse_destination_token(ctx, &ctx->dest_arg);
    return retval;
} // parse_args_D


static int parse_args_S(Context *ctx)
{
    int retval = 1;
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    return retval;
} // parse_args_S


static int parse_args_SS(Context *ctx)
{
    int retval = 1;
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    retval += parse_source_token(ctx, &ctx->source_args[1]);
    return retval;
} // parse_args_SS


static int parse_args_DS(Context *ctx)
{
    int retval = 1;
    retval += parse_destination_token(ctx, &ctx->dest_arg);
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    return retval;
} // parse_args_DS


static int parse_args_DSS(Context *ctx)
{
    int retval = 1;
    retval += parse_destination_token(ctx, &ctx->dest_arg);
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    retval += parse_source_token(ctx, &ctx->source_args[1]);
    return retval;
} // parse_args_DSS


static int parse_args_DSSS(Context *ctx)
{
    int retval = 1;
    retval += parse_destination_token(ctx, &ctx->dest_arg);
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    retval += parse_source_token(ctx, &ctx->source_args[1]);
    retval += parse_source_token(ctx, &ctx->source_args[2]);
    return retval;
} // parse_args_DSSS


static int parse_args_DSSSS(Context *ctx)
{
    int retval = 1;
    retval += parse_destination_token(ctx, &ctx->dest_arg);
    retval += parse_source_token(ctx, &ctx->source_args[0]);
    retval += parse_source_token(ctx, &ctx->source_args[1]);
    retval += parse_source_token(ctx, &ctx->source_args[2]);
    retval += parse_source_token(ctx, &ctx->source_args[3]);
    return retval;
} // parse_args_DSSSS


static int parse_args_SINCOS(Context *ctx)
{
    // this opcode needs extra registers for sm2 and lower.
    if (!shader_version_atleast(ctx, 3, 0))
        return parse_args_DSSS(ctx);
    return parse_args_DS(ctx);
} // parse_args_SINCOS


static int parse_args_TEXCRD(Context *ctx)
{
    // added extra register in ps_1_4.
    if (shader_version_atleast(ctx, 1, 4))
        return parse_args_DS(ctx);
    return parse_args_D(ctx);
} // parse_args_TEXCRD


static int parse_args_TEXLD(Context *ctx)
{
    // different registers in px_1_3, ps_1_4, and ps_2_0!
    if (shader_version_atleast(ctx, 2, 0))
        return parse_args_DSS(ctx);
    else if (shader_version_atleast(ctx, 1, 4))
        return parse_args_DS(ctx);
    return parse_args_D(ctx);
} // parse_args_TEXLD


// State machine functions...

static ConstantsList *alloc_constant_listitem(Context *ctx)
{
    ConstantsList *item = (ConstantsList *) Malloc(ctx, sizeof (ConstantsList));
    if (item == NULL)
        return NULL;

    memset(&item->constant, '\0', sizeof (MOJOSHADER_constant));
    item->next = ctx->constants;
    ctx->constants = item;
    ctx->constant_count++;

    return item;
} // alloc_constant_listitem


static void state_DEF(Context *ctx)
{
    const RegisterType regtype = ctx->dest_arg.regtype;
    const int regnum = ctx->dest_arg.regnum;

    // !!! FIXME: fail if same register is defined twice.

    if (ctx->instruction_count != 0)
        fail(ctx, "DEF token must come before any instructions");
    else if (regtype != REG_TYPE_CONST)
        fail(ctx, "DEF token using invalid register");
    else
    {
        ConstantsList *item = alloc_constant_listitem(ctx);
        if (item != NULL)
        {
            item->constant.index = regnum;
            item->constant.type = MOJOSHADER_UNIFORM_FLOAT;
            memcpy(item->constant.value.f, ctx->dwords,
                   sizeof (item->constant.value.f));
            set_defined_register(ctx, regtype, regnum);
        } // if
    } // else
} // state_DEF

static void state_DEFI(Context *ctx)
{
    const RegisterType regtype = ctx->dest_arg.regtype;
    const int regnum = ctx->dest_arg.regnum;

    // !!! FIXME: fail if same register is defined twice.

    if (ctx->instruction_count != 0)
        fail(ctx, "DEFI token must come before any instructions");
    else if (regtype != REG_TYPE_CONSTINT)
        fail(ctx, "DEFI token using invalid register");
    else
    {
        ConstantsList *item = alloc_constant_listitem(ctx);
        if (item != NULL)
        {
            item->constant.index = regnum;
            item->constant.type = MOJOSHADER_UNIFORM_INT;
            memcpy(item->constant.value.i, ctx->dwords,
                   sizeof (item->constant.value.i));

            set_defined_register(ctx, regtype, regnum);
        } // if
    } // else
} // state_DEFI

static void state_DEFB(Context *ctx)
{
    const RegisterType regtype = ctx->dest_arg.regtype;
    const int regnum = ctx->dest_arg.regnum;

    // !!! FIXME: fail if same register is defined twice.

    if (ctx->instruction_count != 0)
        fail(ctx, "DEFB token must come before any instructions");
    else if (regtype != REG_TYPE_CONSTBOOL)
        fail(ctx, "DEFB token using invalid register");
    else
    {
        ConstantsList *item = alloc_constant_listitem(ctx);
        if (item != NULL)
        {
            item->constant.index = regnum;
            item->constant.type = MOJOSHADER_UNIFORM_BOOL;
            item->constant.value.b = ctx->dwords[0] ? 1 : 0;
            set_defined_register(ctx, regtype, regnum);
        } // if
    } // else
} // state_DEFB

static void state_DCL(Context *ctx)
{
    const DestArgInfo *arg = &ctx->dest_arg;
    const RegisterType regtype = arg->regtype;
    const int regnum = arg->regnum;
    const int wmask = arg->writemask;
    const int mods = arg->result_mod;

    // parse_args_DCL() does a lot of state checking before we get here.

    // !!! FIXME: apparently vs_3_0 can use sampler registers now.
    // !!! FIXME:  (but only s0 through s3, not all 16 of them.)

    if (ctx->instruction_count != 0)
        fail(ctx, "DCL token must come before any instructions");

    else if (shader_is_vertex(ctx))
    {
        if (regtype == REG_TYPE_SAMPLER)
            add_sampler(ctx, regnum, (TextureType) ctx->dwords[0], 0);
        else
        {
            const MOJOSHADER_usage usage = (const MOJOSHADER_usage) ctx->dwords[0];
            const int index = ctx->dwords[1];
            if (usage >= MOJOSHADER_USAGE_TOTAL)
            {
                fail(ctx, "unknown DCL usage");
                return;
            } // if
            add_attribute_register(ctx, regtype, regnum, usage, index, wmask, mods);
        } // else
    } // if

    else if (shader_is_pixel(ctx))
    {
        if (regtype == REG_TYPE_SAMPLER)
            add_sampler(ctx, regnum, (TextureType) ctx->dwords[0], 0);
        else
        {
            const MOJOSHADER_usage usage = (MOJOSHADER_usage) ctx->dwords[0];
            const int index = ctx->dwords[1];
            add_attribute_register(ctx, regtype, regnum, usage, index, wmask, mods);
        } // else
    } // else if

    else
    {
        fail(ctx, "unsupported shader type."); // should be caught elsewhere.
        return;
    } // else

    set_defined_register(ctx, regtype, regnum);
} // state_DCL

static void state_TEXCRD(Context *ctx)
{
    if (shader_version_atleast(ctx, 2, 0))
        fail(ctx, "TEXCRD in Shader Model >= 2.0");  // apparently removed.
} // state_TEXCRD

static void state_FRC(Context *ctx)
{
    const DestArgInfo *dst = &ctx->dest_arg;

    if (dst->result_mod & MOD_SATURATE)  // according to msdn...
        fail(ctx, "FRC destination can't use saturate modifier");

    else if (!shader_version_atleast(ctx, 2, 0))
    {
        if (!writemask_y(dst->writemask) && !writemask_xy(dst->writemask))
            fail(ctx, "FRC writemask must be .y or .xy for shader model 1.x");
    } // else if
} // state_FRC


// replicate the matrix registers to source args. The D3D profile will
//  only use the one legitimate argument, but this saves other profiles
//  from having to build this.
static void srcarg_matrix_replicate(Context *ctx, const int idx,
                                       const int rows)
{
    int i;
    SourceArgInfo *src = &ctx->source_args[idx];
    SourceArgInfo *dst = &ctx->source_args[idx+1];
    for (i = 0; i < (rows-1); i++, dst++)
    {
        memcpy(dst, src, sizeof (SourceArgInfo));
        dst->regnum += (i + 1);
        set_used_register(ctx, dst->regtype, dst->regnum, 0);
    } // for
} // srcarg_matrix_replicate

static void state_M4X4(Context *ctx)
{
    const DestArgInfo *info = &ctx->dest_arg;
    if (!writemask_xyzw(info->writemask))
        fail(ctx, "M4X4 writemask must be full");

// !!! FIXME: MSDN:
//The xyzw (default) mask is required for the destination register. Negate and swizzle modifiers are allowed for src0, but not for src1.
//Swizzle and negate modifiers are invalid for the src0 register. The dest and src0 registers cannot be the same.

    srcarg_matrix_replicate(ctx, 1, 4);
} // state_M4X4

static void state_M4X3(Context *ctx)
{
    const DestArgInfo *info = &ctx->dest_arg;
    if (!writemask_xyz(info->writemask))
        fail(ctx, "M4X3 writemask must be .xyz");

// !!! FIXME: MSDN stuff

    srcarg_matrix_replicate(ctx, 1, 3);
} // state_M4X3

static void state_M3X4(Context *ctx)
{
    const DestArgInfo *info = &ctx->dest_arg;
    if (!writemask_xyzw(info->writemask))
        fail(ctx, "M3X4 writemask must be .xyzw");

// !!! FIXME: MSDN stuff

    srcarg_matrix_replicate(ctx, 1, 4);
} // state_M3X4

static void state_M3X3(Context *ctx)
{
    const DestArgInfo *info = &ctx->dest_arg;
    if (!writemask_xyz(info->writemask))
        fail(ctx, "M3X3 writemask must be .xyz");

// !!! FIXME: MSDN stuff

    srcarg_matrix_replicate(ctx, 1, 3);
} // state_M3X3

static void state_M3X2(Context *ctx)
{
    const DestArgInfo *info = &ctx->dest_arg;
    if (!writemask_xy(info->writemask))
        fail(ctx, "M3X2 writemask must be .xy");

// !!! FIXME: MSDN stuff

    srcarg_matrix_replicate(ctx, 1, 2);
} // state_M3X2

static void state_RET(Context *ctx)
{
    // MSDN all but says that assembly shaders are more or less serialized
    //  HLSL functions, and a RET means you're at the end of one, unlike how
    //  most CPUs would behave. This is actually really helpful,
    //  since we can use high-level constructs and not a mess of GOTOs,
    //  which is a godsend for GLSL...this also means we can consider things
    //  like a LOOP without a matching ENDLOOP within a label's section as
    //  an error.
    if (ctx->loops > 0)
        fail(ctx, "LOOP without ENDLOOP");
    if (ctx->reps > 0)
        fail(ctx, "REP without ENDREP");
} // state_RET

static void check_label_register(Context *ctx, int arg, const char *opcode)
{
    const SourceArgInfo *info = &ctx->source_args[arg];
    const RegisterType regtype = info->regtype;
    const int regnum = info->regnum;

    if (regtype != REG_TYPE_LABEL)
        failf(ctx, "%s with a non-label register specified", opcode);
    if (!shader_version_atleast(ctx, 2, 0))
        failf(ctx, "%s not supported in Shader Model 1", opcode);
    if ((shader_version_atleast(ctx, 2, 255)) && (regnum > 2047))
        fail(ctx, "label register number must be <= 2047");
    if (regnum > 15)
        fail(ctx, "label register number must be <= 15");
} // check_label_register

static void state_LABEL(Context *ctx)
{
    if (ctx->previous_opcode != OPCODE_RET)
        fail(ctx, "LABEL not followed by a RET");
    check_label_register(ctx, 0, "LABEL");
    set_defined_register(ctx, REG_TYPE_LABEL, ctx->source_args[0].regnum);
} // state_LABEL

static void check_call_loop_wrappage(Context *ctx, const int regnum)
{
    // msdn says subroutines inherit aL register if you're in a loop when
    //  you call, and further more _if you ever call this function in a loop,
    //  it must always be called in a loop_. So we'll just pass our loop
    //  variable as a function parameter in those cases.

    const int current_usage = (ctx->loops > 0) ? 1 : -1;
    RegisterList *reg = reglist_find(&ctx->used_registers, REG_TYPE_LABEL, regnum);
    assert(reg != NULL);

    if (reg->misc == 0)
        reg->misc = current_usage;
    else if (reg->misc != current_usage)
    {
        if (current_usage == 1)
            fail(ctx, "CALL to this label must be wrapped in LOOP/ENDLOOP");
        else
            fail(ctx, "CALL to this label must not be wrapped in LOOP/ENDLOOP");
    } // else if
} // check_call_loop_wrappage

static void state_CALL(Context *ctx)
{
    check_label_register(ctx, 0, "CALL");
    check_call_loop_wrappage(ctx, ctx->source_args[0].regnum);
} // state_CALL

static void state_CALLNZ(Context *ctx)
{
    const RegisterType regtype = ctx->source_args[1].regtype;
    if ((regtype != REG_TYPE_CONSTBOOL) && (regtype != REG_TYPE_PREDICATE))
        fail(ctx, "CALLNZ argument isn't constbool or predicate register");
    check_label_register(ctx, 0, "CALLNZ");
    check_call_loop_wrappage(ctx, ctx->source_args[0].regnum);
} // state_CALLNZ

static void state_MOVA(Context *ctx)
{
    if (ctx->dest_arg.regtype != REG_TYPE_ADDRESS)
        fail(ctx, "MOVA argument isn't address register");
} // state_MOVA

static void state_RCP(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "RCP without replicate swizzzle");
} // state_RCP

static void state_LOOP(Context *ctx)
{
    if (ctx->source_args[0].regtype != REG_TYPE_LOOP)
        fail(ctx, "LOOP argument isn't loop register");
    else if (ctx->source_args[1].regtype != REG_TYPE_CONSTINT)
        fail(ctx, "LOOP argument isn't constint register");
    else
        ctx->loops++;
} // state_LOOP

static void state_ENDLOOP(Context *ctx)
{
    // !!! FIXME: check that we aren't straddling an IF block.
    if (ctx->loops <= 0)
        fail(ctx, "ENDLOOP without LOOP");
    ctx->loops--;
} // state_ENDLOOP

static void state_BREAKP(Context *ctx)
{
    const RegisterType regtype = ctx->source_args[0].regtype;
    if (regtype != REG_TYPE_PREDICATE)
        fail(ctx, "BREAKP argument isn't predicate register");
    else if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "BREAKP without replicate swizzzle");
    else if ((ctx->loops == 0) && (ctx->reps == 0))
        fail(ctx, "BREAKP outside LOOP/ENDLOOP or REP/ENDREP");
} // state_BREAKP

static void state_BREAK(Context *ctx)
{
    if ((ctx->loops == 0) && (ctx->reps == 0))
        fail(ctx, "BREAK outside LOOP/ENDLOOP or REP/ENDREP");
} // state_BREAK

static void state_SETP(Context *ctx)
{
    const RegisterType regtype = ctx->dest_arg.regtype;
    if (regtype != REG_TYPE_PREDICATE)
        fail(ctx, "SETP argument isn't predicate register");
} // state_SETP

static void state_REP(Context *ctx)
{
    const RegisterType regtype = ctx->source_args[0].regtype;
    if (regtype != REG_TYPE_CONSTINT)
        fail(ctx, "REP argument isn't constint register");

    ctx->reps++;
    if (ctx->reps > ctx->max_reps)
        ctx->max_reps = ctx->reps;
} // state_REP

static void state_ENDREP(Context *ctx)
{
    // !!! FIXME: check that we aren't straddling an IF block.
    if (ctx->reps <= 0)
        fail(ctx, "ENDREP without REP");
    ctx->reps--;
} // state_ENDREP

static void state_CMP(Context *ctx)
{
    ctx->cmps++;

    // extra limitations for ps <= 1.4 ...
    if (!shader_version_atleast(ctx, 1, 4))
    {
        int i;
        const DestArgInfo *dst = &ctx->dest_arg;
        const RegisterType dregtype = dst->regtype;
        const int dregnum = dst->regnum;

        if (ctx->cmps > 3)
            fail(ctx, "only 3 CMP instructions allowed in this shader model");

        for (i = 0; i < 3; i++)
        {
            const SourceArgInfo *src = &ctx->source_args[i];
            const RegisterType sregtype = src->regtype;
            const int sregnum = src->regnum;
            if ((dregtype == sregtype) && (dregnum == sregnum))
                fail(ctx, "CMP dest can't match sources in this shader model");
        } // for

        ctx->instruction_count++;  // takes an extra slot in ps_1_2 and _3.
    } // if
} // state_CMP

static void state_DP4(Context *ctx)
{
    // extra limitations for ps <= 1.4 ...
    if (!shader_version_atleast(ctx, 1, 4))
        ctx->instruction_count++;  // takes an extra slot in ps_1_2 and _3.
} // state_DP4

static void state_CND(Context *ctx)
{
    // apparently it was removed...it's not in the docs past ps_1_4 ...
    if (shader_version_atleast(ctx, 2, 0))
        fail(ctx, "CND not allowed in this shader model");

    // extra limitations for ps <= 1.4 ...
    else if (!shader_version_atleast(ctx, 1, 4))
    {
        const SourceArgInfo *src = &ctx->source_args[0];
        if ((src->regtype != REG_TYPE_TEMP) || (src->regnum != 0) ||
            (src->swizzle != 0xFF))
        {
            fail(ctx, "CND src must be r0.a in this shader model");
        } // if
    } // if
} // state_CND

static void state_POW(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "POW src0 must have replicate swizzle");
    else if (!replicate_swizzle(ctx->source_args[1].swizzle))
        fail(ctx, "POW src1 must have replicate swizzle");
} // state_POW

static void state_LOG(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "LOG src0 must have replicate swizzle");
} // state_LOG

static void state_LOGP(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "LOGP src0 must have replicate swizzle");
} // state_LOGP

static void state_SINCOS(Context *ctx)
{
    const DestArgInfo *dst = &ctx->dest_arg;
    const int mask = dst->writemask;
    if (!writemask_x(mask) && !writemask_y(mask) && !writemask_xy(mask))
        fail(ctx, "SINCOS write mask must be .x or .y or .xy");

    else if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "SINCOS src0 must have replicate swizzle");

    else if (dst->result_mod & MOD_SATURATE)  // according to msdn...
        fail(ctx, "SINCOS destination can't use saturate modifier");

    // this opcode needs extra registers, with extra limitations, for <= sm2.
    else if (!shader_version_atleast(ctx, 3, 0))
    {
        int i;
        for (i = 1; i < 3; i++)
        {
            if (ctx->source_args[i].regtype != REG_TYPE_CONST)
            {
                failf(ctx, "SINCOS src%d must be constfloat", i);
                return;
            } // if
        } // for

        if (ctx->source_args[1].regnum == ctx->source_args[2].regnum)
            fail(ctx, "SINCOS src1 and src2 must be different registers");
    } // if
} // state_SINCOS

static void state_IF(Context *ctx)
{
    const RegisterType regtype = ctx->source_args[0].regtype;
    if ((regtype != REG_TYPE_PREDICATE) && (regtype != REG_TYPE_CONSTBOOL))
        fail(ctx, "IF src0 must be CONSTBOOL or PREDICATE");
    // !!! FIXME: track if nesting depth.
} // state_IF

static void state_IFC(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "IFC src0 must have replicate swizzle");
    else if (!replicate_swizzle(ctx->source_args[1].swizzle))
        fail(ctx, "IFC src1 must have replicate swizzle");
    // !!! FIXME: track if nesting depth.
} // state_IFC

static void state_BREAKC(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[0].swizzle))
        fail(ctx, "BREAKC src1 must have replicate swizzle");
    else if (!replicate_swizzle(ctx->source_args[1].swizzle))
        fail(ctx, "BREAKC src2 must have replicate swizzle");
    else if ((ctx->loops == 0) && (ctx->reps == 0))
        fail(ctx, "BREAKC outside LOOP/ENDLOOP or REP/ENDREP");
} // state_BREAKC

static void state_TEXKILL(Context *ctx)
{
    // The MSDN docs say this should be a source arg, but the driver docs
    //  say it's a dest arg. That's annoying.
    const DestArgInfo *info = &ctx->dest_arg;
    const RegisterType regtype = info->regtype;
    if (!writemask_xyzw(info->writemask))
        fail(ctx, "TEXKILL writemask must be .xyzw");
    else if ((regtype != REG_TYPE_TEMP) && (regtype != REG_TYPE_TEXTURE))
        fail(ctx, "TEXKILL must use a temp or texture register");

    // !!! FIXME: "If a temporary register is used, all components must have been previously written."
    // !!! FIXME: "If a texture register is used, all components that are read must have been declared."
    // !!! FIXME: there are further limitations in ps_1_3 and earlier.
} // state_TEXKILL

// Some rules that apply to some of the fruity ps_1_1 texture opcodes...
static void state_texops(Context *ctx, const char *opcode,
                         const int dims, const int texbem)
{
    const DestArgInfo *dst = &ctx->dest_arg;
    const SourceArgInfo *src = &ctx->source_args[0];
    if (dst->regtype != REG_TYPE_TEXTURE)
        failf(ctx, "%s destination must be a texture register", opcode);
    if (src->regtype != REG_TYPE_TEXTURE)
        failf(ctx, "%s source must be a texture register", opcode);
    if (src->regnum >= dst->regnum)  // so says MSDN.
        failf(ctx, "%s dest must be a higher register than source", opcode);

    if (dims)
    {
        TextureType ttyp = (dims == 2) ? TEXTURE_TYPE_2D : TEXTURE_TYPE_CUBE;
        add_sampler(ctx, dst->regnum, ttyp, texbem);
    } // if

    add_attribute_register(ctx, REG_TYPE_TEXTURE, dst->regnum,
                           MOJOSHADER_USAGE_TEXCOORD, dst->regnum, 0xF, 0);

    // Strictly speaking, there should be a TEX opcode prior to this call that
    //  should fill in this metadata, but I'm not sure that's required for the
    //  shader to assemble in D3D, so we'll do this so we don't fail with a
    //  cryptic error message even if the developer didn't do the TEX.
    add_attribute_register(ctx, REG_TYPE_TEXTURE, src->regnum,
                           MOJOSHADER_USAGE_TEXCOORD, src->regnum, 0xF, 0);
} // state_texops

static void state_texbem(Context *ctx, const char *opcode)
{
    // The TEXBEM equasion, according to MSDN:
    //u' = TextureCoordinates(stage m)u + D3DTSS_BUMPENVMAT00(stage m)*t(n)R
    //         + D3DTSS_BUMPENVMAT10(stage m)*t(n)G
    //v' = TextureCoordinates(stage m)v + D3DTSS_BUMPENVMAT01(stage m)*t(n)R
    //         + D3DTSS_BUMPENVMAT11(stage m)*t(n)G
    //t(m)RGBA = TextureSample(stage m)
    //
    // ...TEXBEML adds this at the end:
    //t(m)RGBA = t(m)RGBA * [(t(n)B * D3DTSS_BUMPENVLSCALE(stage m)) +
    //           D3DTSS_BUMPENVLOFFSET(stage m)]

    if (shader_version_atleast(ctx, 1, 4))
        failf(ctx, "%s opcode not available after Shader Model 1.3", opcode);

    if (!shader_version_atleast(ctx, 1, 2))
    {
        if (ctx->source_args[0].src_mod == SRCMOD_SIGN)
            failf(ctx, "%s forbids _bx2 on source reg before ps_1_2", opcode);
    } // if

    // !!! FIXME: MSDN:
    // !!! FIXME: Register data that has been read by a texbem
    // !!! FIXME:  or texbeml instruction cannot be read later,
    // !!! FIXME:  except by another texbem or texbeml.

    state_texops(ctx, opcode, 2, 1);
} // state_texbem

static void state_TEXBEM(Context *ctx)
{
    state_texbem(ctx, "TEXBEM");
} // state_TEXBEM

static void state_TEXBEML(Context *ctx)
{
    state_texbem(ctx, "TEXBEML");
} // state_TEXBEML

static void state_TEXM3X2PAD(Context *ctx)
{
    if (shader_version_atleast(ctx, 1, 4))
        fail(ctx, "TEXM3X2PAD opcode not available after Shader Model 1.3");
    state_texops(ctx, "TEXM3X2PAD", 0, 0);
    // !!! FIXME: check for correct opcode existance and order more rigorously?
    ctx->texm3x2pad_src0 = ctx->source_args[0].regnum;
    ctx->texm3x2pad_dst0 = ctx->dest_arg.regnum;
} // state_TEXM3X2PAD

static void state_TEXM3X2TEX(Context *ctx)
{
    if (shader_version_atleast(ctx, 1, 4))
        fail(ctx, "TEXM3X2TEX opcode not available after Shader Model 1.3");
    if (ctx->texm3x2pad_dst0 == -1)
        fail(ctx, "TEXM3X2TEX opcode without matching TEXM3X2PAD");
    // !!! FIXME: check for correct opcode existance and order more rigorously?
    state_texops(ctx, "TEXM3X2TEX", 2, 0);
    ctx->reset_texmpad = 1;

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      ctx->dest_arg.regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);

    // A samplermap might change this to something nonsensical.
    if (ttype != TEXTURE_TYPE_2D)
        fail(ctx, "TEXM3X2TEX needs a 2D sampler");
} // state_TEXM3X2TEX

static void state_TEXM3X3PAD(Context *ctx)
{
    if (shader_version_atleast(ctx, 1, 4))
        fail(ctx, "TEXM3X2TEX opcode not available after Shader Model 1.3");
    state_texops(ctx, "TEXM3X3PAD", 0, 0);

    // !!! FIXME: check for correct opcode existance and order more rigorously?
    if (ctx->texm3x3pad_dst0 == -1)
    {
        ctx->texm3x3pad_src0 = ctx->source_args[0].regnum;
        ctx->texm3x3pad_dst0 = ctx->dest_arg.regnum;
    } // if
    else if (ctx->texm3x3pad_dst1 == -1)
    {
        ctx->texm3x3pad_src1 = ctx->source_args[0].regnum;
        ctx->texm3x3pad_dst1 = ctx->dest_arg.regnum;
    } // else
} // state_TEXM3X3PAD

static void state_texm3x3(Context *ctx, const char *opcode, const int dims)
{
    // !!! FIXME: check for correct opcode existance and order more rigorously?
    if (shader_version_atleast(ctx, 1, 4))
        failf(ctx, "%s opcode not available after Shader Model 1.3", opcode);
    if (ctx->texm3x3pad_dst1 == -1)
        failf(ctx, "%s opcode without matching TEXM3X3PADs", opcode);
    state_texops(ctx, opcode, dims, 0);
    ctx->reset_texmpad = 1;

    RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
                                      ctx->dest_arg.regnum);
    const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);

    // A samplermap might change this to something nonsensical.
    if ((ttype != TEXTURE_TYPE_VOLUME) && (ttype != TEXTURE_TYPE_CUBE))
        failf(ctx, "%s needs a 3D or Cubemap sampler", opcode);
} // state_texm3x3

static void state_TEXM3X3(Context *ctx)
{
    if (!shader_version_atleast(ctx, 1, 2))
        fail(ctx, "TEXM3X3 opcode not available in Shader Model 1.1");
    state_texm3x3(ctx, "TEXM3X3", 0);
} // state_TEXM3X3

static void state_TEXM3X3TEX(Context *ctx)
{
    state_texm3x3(ctx, "TEXM3X3TEX", 3);
} // state_TEXM3X3TEX

static void state_TEXM3X3SPEC(Context *ctx)
{
    state_texm3x3(ctx, "TEXM3X3SPEC", 3);
    if (ctx->source_args[1].regtype != REG_TYPE_CONST)
        fail(ctx, "TEXM3X3SPEC final arg must be a constant register");
} // state_TEXM3X3SPEC

static void state_TEXM3X3VSPEC(Context *ctx)
{
    state_texm3x3(ctx, "TEXM3X3VSPEC", 3);
} // state_TEXM3X3VSPEC


static void state_TEXLD(Context *ctx)
{
    if (shader_version_atleast(ctx, 2, 0))
    {
        const SourceArgInfo *src0 = &ctx->source_args[0];
        const SourceArgInfo *src1 = &ctx->source_args[1];

        // !!! FIXME: verify texldp restrictions:
        //http://msdn.microsoft.com/en-us/library/bb206221(VS.85).aspx
        // !!! FIXME: ...and texldb, too.
        //http://msdn.microsoft.com/en-us/library/bb206217(VS.85).aspx

        //const RegisterType rt0 = src0->regtype;

        // !!! FIXME: msdn says it has to be temp, but Microsoft's HLSL
        // !!! FIXME:  compiler is generating code that uses oC0 for a dest.
        //if (ctx->dest_arg.regtype != REG_TYPE_TEMP)
        //    fail(ctx, "TEXLD dest must be a temp register");

        // !!! FIXME: this can be an REG_TYPE_INPUT, DCL'd to TEXCOORD.
        //else if ((rt0 != REG_TYPE_TEXTURE) && (rt0 != REG_TYPE_TEMP))
        //    fail(ctx, "TEXLD src0 must be texture or temp register");
        //else

        if (src0->src_mod != SRCMOD_NONE)
            fail(ctx, "TEXLD src0 must have no modifiers");
        else if (src1->regtype != REG_TYPE_SAMPLER)
            fail(ctx, "TEXLD src1 must be sampler register");
        else if (src1->src_mod != SRCMOD_NONE)
            fail(ctx, "TEXLD src1 must have no modifiers");
        else if ( (ctx->instruction_controls != CONTROL_TEXLD) &&
                  (ctx->instruction_controls != CONTROL_TEXLDP) &&
                  (ctx->instruction_controls != CONTROL_TEXLDB) )
        {
            fail(ctx, "TEXLD has unknown control bits");
        } // else if

        // Shader Model 3 added swizzle support to this opcode.
        if (!shader_version_atleast(ctx, 3, 0))
        {
            if (!no_swizzle(src0->swizzle))
                fail(ctx, "TEXLD src0 must not swizzle");
            else if (!no_swizzle(src1->swizzle))
                fail(ctx, "TEXLD src1 must not swizzle");
        } // if

        if ( ((TextureType) ctx->source_args[1].regnum) == TEXTURE_TYPE_CUBE )
            ctx->instruction_count += 3;
    } // if

    else if (shader_version_atleast(ctx, 1, 4))
    {
        // !!! FIXME: checks for ps_1_4 version here...
    } // else if

    else
    {
        // !!! FIXME: add (other?) checks for ps_1_1 version here...
        const DestArgInfo *info = &ctx->dest_arg;
        const int sampler = info->regnum;
        if (info->regtype != REG_TYPE_TEXTURE)
            fail(ctx, "TEX param must be a texture register");
        add_sampler(ctx, sampler, TEXTURE_TYPE_2D, 0);
        add_attribute_register(ctx, REG_TYPE_TEXTURE, sampler,
                               MOJOSHADER_USAGE_TEXCOORD, sampler, 0xF, 0);
    } // else
} // state_TEXLD

static void state_TEXLDL(Context *ctx)
{
    if (!shader_version_atleast(ctx, 3, 0))
        fail(ctx, "TEXLDL in version < Shader Model 3.0");
    else if (ctx->source_args[1].regtype != REG_TYPE_SAMPLER)
        fail(ctx, "TEXLDL src1 must be sampler register");
    else
    {
        if ( ((TextureType) ctx->source_args[1].regnum) == TEXTURE_TYPE_CUBE )
            ctx->instruction_count += 3;
    } // else
} // state_TEXLDL

static void state_DP2ADD(Context *ctx)
{
    if (!replicate_swizzle(ctx->source_args[2].swizzle))
        fail(ctx, "DP2ADD src2 must have replicate swizzle");
} // state_DP2ADD


// Lookup table for instruction opcodes...
typedef struct
{
    const char *opcode_string;
    int slots;  // number of instruction slots this opcode eats.
    MOJOSHADER_shaderType shader_types;  // mask of types that can use opcode.
    args_function parse_args;
    state_function state;
    emit_function emitter[STATICARRAYLEN(profiles)];
} Instruction;

// These have to be in the right order! This array is indexed by the value
//  of the instruction token.
static const Instruction instructions[] =
{
    #define INSTRUCTION_STATE(op, opstr, slots, a, t) { \
        opstr, slots, t, parse_args_##a, state_##op, PROFILE_EMITTERS(op) \
    },

    #define INSTRUCTION(op, opstr, slots, a, t) { \
        opstr, slots, t, parse_args_##a, 0, PROFILE_EMITTERS(op) \
    },

    #define MOJOSHADER_DO_INSTRUCTION_TABLE 1
    #include "mojoshader_internal.h"
    #undef MOJOSHADER_DO_INSTRUCTION_TABLE

    #undef INSTRUCTION
    #undef INSTRUCTION_STATE
};


// parse various token types...

static int parse_instruction_token(Context *ctx)
{
    int retval = 0;
    const int start_position = ctx->current_position;
    const uint32 *start_tokens = ctx->tokens;
    const uint32 start_tokencount = ctx->tokencount;
    const uint32 token = SWAP32(*(ctx->tokens));
    const uint32 opcode = (token & 0xFFFF);
    const uint32 controls = ((token >> 16) & 0xFF);
    const uint32 insttoks = ((token >> 24) & 0x0F);
    const int coissue = (token & 0x40000000) ? 1 : 0;
    const int predicated = (token & 0x10000000) ? 1 : 0;

    if ( opcode >= (sizeof (instructions) / sizeof (instructions[0])) )
        return 0;  // not an instruction token, or just not handled here.

    const Instruction *instruction = &instructions[opcode];
    const emit_function emitter = instruction->emitter[ctx->profileid];

    if ((token & 0x80000000) != 0)
        fail(ctx, "instruction token high bit must be zero.");  // so says msdn.

    if (instruction->opcode_string == NULL)
    {
        fail(ctx, "Unknown opcode.");
        return insttoks + 1;  // pray that you resync later.
    } // if

    ctx->coissue = coissue;
    if (coissue)
    {
        if (!shader_is_pixel(ctx))
            fail(ctx, "coissue instruction on non-pixel shader");
        if (shader_version_atleast(ctx, 2, 0))
            fail(ctx, "coissue instruction in Shader Model >= 2.0");
    } // if

    if ((ctx->shader_type & instruction->shader_types) == 0)
    {
        failf(ctx, "opcode '%s' not available in this shader type.",
                instruction->opcode_string);
    } // if

    memset(ctx->dwords, '\0', sizeof (ctx->dwords));
    ctx->instruction_controls = controls;
    ctx->predicated = predicated;

    // Update the context with instruction's arguments.
    adjust_token_position(ctx, 1);
    retval = instruction->parse_args(ctx);

    if (predicated)
        retval += parse_predicated_token(ctx);

    // parse_args() moves these forward for convenience...reset them.
    ctx->tokens = start_tokens;
    ctx->tokencount = start_tokencount;
    ctx->current_position = start_position;

    if (instruction->state != NULL)
        instruction->state(ctx);

    ctx->instruction_count += instruction->slots;

    if (!isfail(ctx))
        emitter(ctx);  // call the profile's emitter.

    if (ctx->reset_texmpad)
    {
        ctx->texm3x2pad_dst0 = -1;
        ctx->texm3x2pad_src0 = -1;
        ctx->texm3x3pad_dst0 = -1;
        ctx->texm3x3pad_src0 = -1;
        ctx->texm3x3pad_dst1 = -1;
        ctx->texm3x3pad_src1 = -1;
        ctx->reset_texmpad = 0;
    } // if

    ctx->previous_opcode = opcode;
    ctx->scratch_registers = 0;  // reset after every instruction.

    if (!shader_version_atleast(ctx, 2, 0))
    {
        if (insttoks != 0)  // reserved field in shaders < 2.0 ...
            fail(ctx, "instruction token count must be zero");
    } // if
    else
    {
        if (((uint32)retval) != (insttoks+1))
        {
            failf(ctx, "wrong token count (%u, not %u) for opcode '%s'.",
                    (uint) retval, (uint) (insttoks+1),
                    instruction->opcode_string);
            retval = insttoks + 1;  // try to keep sync.
        } // if
    } // else

    return retval;
} // parse_instruction_token


static int parse_version_token(Context *ctx, const char *profilestr)
{
    if (ctx->tokencount == 0)
    {
        fail(ctx, "Expected version token, got none at all.");
        return 0;
    } // if

    const uint32 token = SWAP32(*(ctx->tokens));
    const uint32 shadertype = ((token >> 16) & 0xFFFF);
    const uint8 major = (uint8) ((token >> 8) & 0xFF);
    const uint8 minor = (uint8) (token & 0xFF);

    ctx->version_token = token;

    // 0xFFFF == pixel shader, 0xFFFE == vertex shader
    if (shadertype == 0xFFFF)
    {
        ctx->shader_type = MOJOSHADER_TYPE_PIXEL;
        ctx->shader_type_str = "ps";
    } // if
    else if (shadertype == 0xFFFE)
    {
        ctx->shader_type = MOJOSHADER_TYPE_VERTEX;
        ctx->shader_type_str = "vs";
    } // else if
    else  // geometry shader? Bogus data?
    {
        fail(ctx, "Unsupported shader type or not a shader at all");
        return -1;
    } // else

    ctx->major_ver = major;
    ctx->minor_ver = minor;

    if (!shader_version_supported(major, minor))
    {
        failf(ctx, "Shader Model %u.%u is currently unsupported.",
                (uint) major, (uint) minor);
    } // if

    if (!isfail(ctx))
        ctx->profile->start_emitter(ctx, profilestr);

    return 1;  // ate one token.
} // parse_version_token


static int parse_ctab_string(const uint8 *start, const uint32 bytes,
                             const uint32 name)
{
    // Make sure strings don't overflow the CTAB buffer...
    if (name < bytes)
    {
        int i;
        const int slenmax = bytes - name;
        const char *namestr = (const char *) (start + name);
        for (i = 0; i < slenmax; i++)
        {
            if (namestr[i] == '\0')
                return 1;  // it's okay.
        } // for
    } // if

    return 0;  // overflowed.
} // parse_ctab_string


static int parse_ctab_typeinfo(Context *ctx, const uint8 *start,
                               const uint32 bytes, const uint32 pos,
                               MOJOSHADER_symbolTypeInfo *info)
{
    if ((pos + 16) >= bytes)
        return 0;  // corrupt CTAB.

    const uint16 *typeptr = (const uint16 *) (start + pos);

    info->parameter_class = (MOJOSHADER_symbolClass) SWAP16(typeptr[0]);
    info->parameter_type = (MOJOSHADER_symbolType) SWAP16(typeptr[1]);
    info->rows = (unsigned int) SWAP16(typeptr[2]);
    info->columns = (unsigned int) SWAP16(typeptr[3]);
    info->elements = (unsigned int) SWAP16(typeptr[4]);
    info->member_count = (unsigned int) SWAP16(typeptr[5]);

    if ((pos + 16 + (info->member_count * 8)) >= bytes)
        return 0;  // corrupt CTAB.

    if (info->member_count == 0)
        info->members = NULL;
    else
    {
        const size_t len = sizeof (MOJOSHADER_symbolStructMember) *
                            info->member_count;
        info->members = (MOJOSHADER_symbolStructMember *) Malloc(ctx, len);
        if (info->members == NULL)
            return 1;  // we'll check ctx->out_of_memory later.
        memset(info->members, '\0', len);
    } // else

    int i;
    const uint32 *member = (const uint32 *) (start + typeptr[6]);
    for (i = 0; i < info->member_count; i++)
    {
        MOJOSHADER_symbolStructMember *mbr = &info->members[i];
        const uint32 name = SWAP32(member[0]);
        const uint32 memberinfopos = SWAP32(member[1]);
        member += 2;

        if (!parse_ctab_string(start, bytes, name))
            return 0;  // info->members will be free()'d elsewhere.

        mbr->name = StrDup(ctx, (const char *) (start + name));
        if (mbr->name == NULL)
            return 1;  // we'll check ctx->out_of_memory later.
        if (!parse_ctab_typeinfo(ctx, start, bytes, memberinfopos, &mbr->info))
            return 0;
        if (ctx->out_of_memory)
            return 1;  // drop out now.
    } // for

    return 1;
} // parse_ctab_typeinfo


// Microsoft's tools add a CTAB comment to all shaders. This is the
//  "constant table," or specifically: D3DXSHADER_CONSTANTTABLE:
//  http://msdn.microsoft.com/en-us/library/bb205440(VS.85).aspx
// This may tell us high-level truths about an otherwise generic low-level
//  registers, for instance, how large an array actually is, etc.
static void parse_constant_table(Context *ctx, const uint32 *tokens,
                                 const uint32 bytes, const uint32 okay_version,
                                 const int setvariables, CtabData *ctab)
{
    const uint32 id = SWAP32(tokens[1]);
    if (id != CTAB_ID)
        return;  // not the constant table.

    assert(ctab->have_ctab == 0);  // !!! FIXME: can you have more than one?
    ctab->have_ctab = 1;

    const uint8 *start = (uint8 *) &tokens[2];

    if (bytes < 32)
    {
        fail(ctx, "Truncated CTAB data");
        return;
    } // if

    const uint32 size = SWAP32(tokens[2]);
    const uint32 creator = SWAP32(tokens[3]);
    const uint32 version = SWAP32(tokens[4]);
    const uint32 constants = SWAP32(tokens[5]);
    const uint32 constantinfo = SWAP32(tokens[6]);
    const uint32 target = SWAP32(tokens[8]);

    if (size != CTAB_SIZE)
        goto corrupt_ctab;

    if (version != okay_version) goto corrupt_ctab;
    if (creator >= bytes) goto corrupt_ctab;
    if ((constantinfo + (constants * CINFO_SIZE)) >= bytes) goto corrupt_ctab;
    if (target >= bytes) goto corrupt_ctab;
    if (!parse_ctab_string(start, bytes, target)) goto corrupt_ctab;
    // !!! FIXME: check that (start+target) points to "ps_3_0", etc.

    ctab->symbol_count = constants;
    ctab->symbols = (MOJOSHADER_symbol *) Malloc(ctx, sizeof (MOJOSHADER_symbol) * constants);
    if (ctab->symbols == NULL)
        return;
    memset(ctab->symbols, '\0', sizeof (MOJOSHADER_symbol) * constants);

    uint32 i = 0;
    for (i = 0; i < constants; i++)
    {
        const uint8 *ptr = start + constantinfo + (i * CINFO_SIZE);
        const uint32 name = SWAP32(*((uint32 *) (ptr + 0)));
        const uint16 regset = SWAP16(*((uint16 *) (ptr + 4)));
        const uint16 regidx = SWAP16(*((uint16 *) (ptr + 6)));
        const uint16 regcnt = SWAP16(*((uint16 *) (ptr + 8)));
        const uint32 typeinf = SWAP32(*((uint32 *) (ptr + 12)));
        const uint32 defval = SWAP32(*((uint32 *) (ptr + 16)));
        MOJOSHADER_uniformType mojotype = MOJOSHADER_UNIFORM_UNKNOWN;

        if (!parse_ctab_string(start, bytes, name)) goto corrupt_ctab;
        if (defval >= bytes) goto corrupt_ctab;

        switch (regset)
        {
            case 0: mojotype = MOJOSHADER_UNIFORM_BOOL; break;
            case 1: mojotype = MOJOSHADER_UNIFORM_INT; break;
            case 2: mojotype = MOJOSHADER_UNIFORM_FLOAT; break;
            case 3: /* SAMPLER */ break;
            default: goto corrupt_ctab;
        } // switch

        if ((setvariables) && (mojotype != MOJOSHADER_UNIFORM_UNKNOWN))
        {
            VariableList *item;
            item = (VariableList *) Malloc(ctx, sizeof (VariableList));
            if (item != NULL)
            {
                item->type = mojotype;
                item->index = regidx;
                item->count = regcnt;
                item->constant = NULL;
                item->used = 0;
                item->emit_position = -1;
                item->next = ctx->variables;
                ctx->variables = item;
            } // if
        } // if

        // Add the symbol.
        const char *namecpy = StrDup(ctx, (const char *) (start + name));
        if (namecpy == NULL)
            return;

        MOJOSHADER_symbol *sym = &ctab->symbols[i];
        sym->name = namecpy;
        sym->register_set = (MOJOSHADER_symbolRegisterSet) regset;
        sym->register_index = (unsigned int) regidx;
        sym->register_count = (unsigned int) regcnt;
        if (!parse_ctab_typeinfo(ctx, start, bytes, typeinf, &sym->info))
            goto corrupt_ctab;  // sym->name will get free()'d later.
        else if (ctx->out_of_memory)
            return;  // just bail now.
    } // for

    return;

corrupt_ctab:
    fail(ctx, "Shader has corrupt CTAB data");
} // parse_constant_table


static void free_symbols(MOJOSHADER_free f, void *d, MOJOSHADER_symbol *syms,
                         const int symcount);


static int is_comment_token(Context *ctx, const uint32 tok, uint32 *tokcount)
{
    const uint32 token = SWAP32(tok);
    if ((token & 0xFFFF) == 0xFFFE)  // actually a comment token?
    {
        if ((token & 0x80000000) != 0)
            fail(ctx, "comment token high bit must be zero.");  // so says msdn.
        *tokcount = ((token >> 16) & 0xFFFF);
        return 1;
    } // if

    return 0;
} // is_comment_token


typedef struct PreshaderBlockInfo
{
    const uint32 *tokens;
    uint32 tokcount;
    int seen;
} PreshaderBlockInfo;

// Preshaders only show up in compiled Effect files. The format is
//  undocumented, and even the instructions aren't the same opcodes as you
//  would find in a regular shader. These things show up because the HLSL
//  compiler can detect work that sets up constant registers that could
//  be moved out of the shader itself. Preshaders run once, then the shader
//  itself runs many times, using the constant registers the preshader has set
//  up. There are cases where the preshaders are 3+ times as many instructions
//  as the shader itself, so this can be a big performance win.
// My presumption is that Microsoft's Effects framework runs the preshaders on
//  the CPU, then loads the constant register file appropriately before handing
//  off to the GPU. As such, we do the same.
static void parse_preshader(Context *ctx, uint32 tokcount)
{
    const uint32 *tokens = ctx->tokens;
    if ((tokcount < 2) || (SWAP32(tokens[1]) != PRES_ID))
        return;  // not a preshader.

#ifndef MOJOSHADER_EFFECT_SUPPORT
    fail(ctx, "Preshader found, but effect support is disabled!");
#else

    assert(ctx->have_preshader == 0);  // !!! FIXME: can you have more than one?
    ctx->have_preshader = 1;

    // !!! FIXME: I don't know what specific versions signify, but we need to
    // !!! FIXME:  save this to test against the CTAB version field, if
    // !!! FIXME:  nothing else.
    // !!! FIXME: 0x02 0x0? is probably the version (fx_2_?),
    // !!! FIXME:  and 0x4658 is the magic, like a real shader's version token.
    const uint32 min_version = 0x46580200;
    const uint32 max_version = 0x46580201;
    const uint32 version = SWAP32(tokens[2]);
    if (version < min_version || version > max_version)
    {
        fail(ctx, "Unsupported preshader version.");
        return;  // fail because the shader will malfunction w/o this.
    } // if

    tokens += 3;
    tokcount -= 3;

    // All sections of a preshader are packed into separate comment tokens,
    //  inside the containing comment token block. Find them all before
    //  we start, so we don't care about the order they appear in the file.
    PreshaderBlockInfo ctab = { 0, 0, 0 };
    PreshaderBlockInfo prsi = { 0, 0, 0 };
    PreshaderBlockInfo fxlc = { 0, 0, 0 };
    PreshaderBlockInfo clit = { 0, 0, 0 };

    while (tokcount > 0)
    {
        uint32 subtokcount = 0;
        if ( (!is_comment_token(ctx, *tokens, &subtokcount)) ||
             (subtokcount > tokcount) )
        {
            fail(ctx, "Bogus preshader data.");
            return;
        } // if

        tokens++;
        tokcount--;

        const uint32 *nexttokens = tokens + subtokcount;
        const uint32 nexttokcount = tokcount - subtokcount;

        if (subtokcount > 0)
        {
            switch (SWAP32(*tokens))
            {
                #define PRESHADER_BLOCK_CASE(id, var) \
                    case id##_ID: { \
                        if (var.seen) { \
                            fail(ctx, "Multiple " #id " preshader blocks."); \
                            return; \
                        } \
                        var.tokens = tokens; \
                        var.tokcount = subtokcount; \
                        var.seen = 1; \
                        break; \
                    }
                PRESHADER_BLOCK_CASE(CTAB, ctab);
                PRESHADER_BLOCK_CASE(PRSI, prsi);
                PRESHADER_BLOCK_CASE(FXLC, fxlc);
                PRESHADER_BLOCK_CASE(CLIT, clit);
                default: fail(ctx, "Bogus preshader section."); return;
                #undef PRESHADER_BLOCK_CASE
            } // switch
        } // if

        tokens = nexttokens;
        tokcount = nexttokcount;
    } // while

    if (!ctab.seen) { fail(ctx, "No CTAB block in preshader."); return; }
    if (!prsi.seen) { fail(ctx, "No PRSI block in preshader."); return; }
    if (!fxlc.seen) { fail(ctx, "No FXLC block in preshader."); return; }
    if (!clit.seen) { fail(ctx, "No CLIT block in preshader."); return; }

    MOJOSHADER_preshader *preshader = (MOJOSHADER_preshader *)
                                    Malloc(ctx, sizeof (MOJOSHADER_preshader));
    if (preshader == NULL)
        return;
    memset(preshader, '\0', sizeof (MOJOSHADER_preshader));
    ctx->preshader = preshader;

    // Let's set up the constant literals first...
    if (clit.tokcount == 0)
        fail(ctx, "Bogus CLIT block in preshader.");
    else
    {
        const uint32 lit_count = SWAP32(clit.tokens[1]);
        if (lit_count > ((clit.tokcount - 2) / 2))
        {
            fail(ctx, "Bogus CLIT block in preshader.");
            return;
        } // if
        else if (lit_count > 0)
        {
            preshader->literal_count = (unsigned int) lit_count;
            assert(sizeof (double) == 8);  // just in case.
            const size_t len = sizeof (double) * lit_count;
            preshader->literals = (double *) Malloc(ctx, len);
            if (preshader->literals == NULL)
                return;  // oh well.
            const double *litptr = (const double *) (clit.tokens + 2);
            int i;
            for (i = 0; i < lit_count; i++)
                preshader->literals[i] = SWAPDBL(litptr[i]);
        } // else if
    } // else

    // Parse out the PRSI block. This is used to map the output registers.
    if (prsi.tokcount < 8)
    {
        fail(ctx, "Bogus preshader PRSI data");
        return;
    } // if

    //const uint32 first_output_reg = SWAP32(prsi.tokens[1]);
    // !!! FIXME: there are a lot of fields here I don't know about.
    // !!! FIXME:  maybe [2] and [3] are for int4 and bool registers?
    //const uint32 output_reg_count = SWAP32(prsi.tokens[4]);
    // !!! FIXME:  maybe [5] and [6] are for int4 and bool registers?
    const uint32 output_map_count = SWAP32(prsi.tokens[7]);

    prsi.tokcount -= 8;
    prsi.tokens += 8;

    if (prsi.tokcount < ((output_map_count + 1) * 2))
    {
        fail(ctx, "Bogus preshader PRSI data");
        return;
    } // if

    const uint32 *output_map = prsi.tokens;

    // Now we'll figure out the CTAB...
    CtabData ctabdata = { 0, 0, 0 };
    parse_constant_table(ctx, ctab.tokens - 1, ctab.tokcount * 4,
                         version, 0, &ctabdata);

    // preshader owns this now. Don't free it in this function.
    preshader->symbol_count = ctabdata.symbol_count;
    preshader->symbols = ctabdata.symbols;

    if (!ctabdata.have_ctab)
    {
        fail(ctx, "Bogus preshader CTAB data");
        return;
    } // if

    // The FXLC block has the actual instructions...
    uint32 opcode_count = SWAP32(fxlc.tokens[1]);

    size_t len = sizeof (MOJOSHADER_preshaderInstruction) * opcode_count;
    preshader->instruction_count = (unsigned int) opcode_count;
    preshader->instructions = (MOJOSHADER_preshaderInstruction *)
                                Malloc(ctx, len);
    if (preshader->instructions == NULL)
        return;
    memset(preshader->instructions, '\0', len);

    fxlc.tokens += 2;
    fxlc.tokcount -= 2;
    if (opcode_count > (fxlc.tokcount / 2))
    {
        fail(ctx, "Bogus preshader FXLC block.");
        return;
    } // if

    MOJOSHADER_preshaderInstruction *inst = preshader->instructions;
    while (opcode_count--)
    {
        const uint32 opcodetok = SWAP32(fxlc.tokens[0]);
        MOJOSHADER_preshaderOpcode opcode = MOJOSHADER_PRESHADEROP_NOP;
        switch ((opcodetok >> 16) & 0xFFFF)
        {
            case 0x1000: opcode = MOJOSHADER_PRESHADEROP_MOV; break;
            case 0x1010: opcode = MOJOSHADER_PRESHADEROP_NEG; break;
            case 0x1030: opcode = MOJOSHADER_PRESHADEROP_RCP; break;
            case 0x1040: opcode = MOJOSHADER_PRESHADEROP_FRC; break;
            case 0x1050: opcode = MOJOSHADER_PRESHADEROP_EXP; break;
            case 0x1060: opcode = MOJOSHADER_PRESHADEROP_LOG; break;
            case 0x1070: opcode = MOJOSHADER_PRESHADEROP_RSQ; break;
            case 0x1080: opcode = MOJOSHADER_PRESHADEROP_SIN; break;
            case 0x1090: opcode = MOJOSHADER_PRESHADEROP_COS; break;
            case 0x10A0: opcode = MOJOSHADER_PRESHADEROP_ASIN; break;
            case 0x10B0: opcode = MOJOSHADER_PRESHADEROP_ACOS; break;
            case 0x10C0: opcode = MOJOSHADER_PRESHADEROP_ATAN; break;
            case 0x2000: opcode = MOJOSHADER_PRESHADEROP_MIN; break;
            case 0x2010: opcode = MOJOSHADER_PRESHADEROP_MAX; break;
            case 0x2020: opcode = MOJOSHADER_PRESHADEROP_LT; break;
            case 0x2030: opcode = MOJOSHADER_PRESHADEROP_GE; break;
            case 0x2040: opcode = MOJOSHADER_PRESHADEROP_ADD; break;
            case 0x2050: opcode = MOJOSHADER_PRESHADEROP_MUL; break;
            case 0x2060: opcode = MOJOSHADER_PRESHADEROP_ATAN2; break;
            case 0x2080: opcode = MOJOSHADER_PRESHADEROP_DIV; break;
            case 0x3000: opcode = MOJOSHADER_PRESHADEROP_CMP; break;
            case 0x3010: opcode = MOJOSHADER_PRESHADEROP_MOVC; break;
            case 0x5000: opcode = MOJOSHADER_PRESHADEROP_DOT; break;
            case 0x5020: opcode = MOJOSHADER_PRESHADEROP_NOISE; break;
            case 0xA000: opcode = MOJOSHADER_PRESHADEROP_MIN_SCALAR; break;
            case 0xA010: opcode = MOJOSHADER_PRESHADEROP_MAX_SCALAR; break;
            case 0xA020: opcode = MOJOSHADER_PRESHADEROP_LT_SCALAR; break;
            case 0xA030: opcode = MOJOSHADER_PRESHADEROP_GE_SCALAR; break;
            case 0xA040: opcode = MOJOSHADER_PRESHADEROP_ADD_SCALAR; break;
            case 0xA050: opcode = MOJOSHADER_PRESHADEROP_MUL_SCALAR; break;
            case 0xA060: opcode = MOJOSHADER_PRESHADEROP_ATAN2_SCALAR; break;
            case 0xA080: opcode = MOJOSHADER_PRESHADEROP_DIV_SCALAR; break;
            case 0xD000: opcode = MOJOSHADER_PRESHADEROP_DOT_SCALAR; break;
            case 0xD020: opcode = MOJOSHADER_PRESHADEROP_NOISE_SCALAR; break;
            default: fail(ctx, "Unknown preshader opcode."); break;
        } // switch

        uint32 operand_count = SWAP32(fxlc.tokens[1]) + 1;  // +1 for dest.

        inst->opcode = opcode;
        inst->element_count = (unsigned int) (opcodetok & 0xFF);
        inst->operand_count = (unsigned int) operand_count;

        fxlc.tokens += 2;
        fxlc.tokcount -= 2;
        if ((operand_count * 3) > fxlc.tokcount)
        {
            fail(ctx, "Bogus preshader FXLC block.");
            return;
        } // if

        MOJOSHADER_preshaderOperand *operand = inst->operands;
        while (operand_count--)
        {
            const unsigned int item = (unsigned int) SWAP32(fxlc.tokens[2]);

            // !!! FIXME: don't know what first token does.
            switch (SWAP32(fxlc.tokens[1]))
            {
                case 1:  // literal from CLIT block.
                {
                    if (item > preshader->literal_count)
                    {
                        fail(ctx, "Bogus preshader literal index.");
                        break;
                    } // if
                    operand->type = MOJOSHADER_PRESHADEROPERAND_LITERAL;
                    break;
                } // case

                case 2:  // item from ctabdata.
                {
                    int i;
                    MOJOSHADER_symbol *sym = ctabdata.symbols;
                    for (i = 0; i < ctabdata.symbol_count; i++, sym++)
                    {
                        const uint32 base = sym->register_index * 4;
                        const uint32 count = sym->register_count * 4;
                        assert(sym->register_set==MOJOSHADER_SYMREGSET_FLOAT4);
                        if ( (base <= item) && ((base + count) > item) )
                            break;
                    } // for
                    if (i == ctabdata.symbol_count)
                    {
                        fail(ctx, "Bogus preshader input index.");
                        break;
                    } // if
                    operand->type = MOJOSHADER_PRESHADEROPERAND_INPUT;
                    break;
                } // case

                case 4:
                {
                    int i;
                    for (i = 0; i < output_map_count; i++)
                    {
                        const uint32 base = output_map[(i*2)] * 4;
                        const uint32 count = output_map[(i*2)+1] * 4;
                        if ( (base <= item) && ((base + count) > item) )
                            break;
                    } // for
                    if (i == output_map_count)
                    {
                        fail(ctx, "Bogus preshader output index.");
                        break;
                    } // if

                    operand->type = MOJOSHADER_PRESHADEROPERAND_OUTPUT;
                    break;
                } // case

                case 7:
                {
                    operand->type = MOJOSHADER_PRESHADEROPERAND_TEMP;
                    if (item >= preshader->temp_count)
                        preshader->temp_count = item + 1;
                    break;
                } // case
            } // switch

            operand->index = item;

            fxlc.tokens += 3;
            fxlc.tokcount -= 3;
            operand++;
        } // while

        inst++;
    } // while

    // Registers need to be vec4, round up to nearest 4
    preshader->temp_count = (preshader->temp_count + 3) & ~3;

    unsigned int largest = 0;
    const MOJOSHADER_symbol *sym = preshader->symbols;
    int i;
    for (i = 0; i < preshader->symbol_count; i++, sym++)
    {
        const unsigned int val = sym->register_index + sym->register_count;
        if (val > largest)
            largest = val;
    } // for

    if (largest > 0)
    {
        const size_t len = largest * sizeof (float) * 4;
        preshader->registers = (float *) Malloc(ctx, len);
        memset(preshader->registers, '\0', len);
        preshader->register_count = largest;
    } // if
#endif
} // parse_preshader


static int parse_comment_token(Context *ctx)
{
    uint32 commenttoks = 0;
    if (is_comment_token(ctx, *ctx->tokens, &commenttoks))
    {
        if ((commenttoks >= 1) && (commenttoks < ctx->tokencount))
        {
            const uint32 id = SWAP32(ctx->tokens[1]);
            if (id == PRES_ID)
                parse_preshader(ctx, commenttoks);
            else if (id == CTAB_ID)
            {
                parse_constant_table(ctx, ctx->tokens, commenttoks * 4,
                                     ctx->version_token, 1, &ctx->ctab);
            } // else if
        } // if
        return commenttoks + 1;  // comment data plus the initial token.
    } // if

    return 0;  // not a comment token.
} // parse_comment_token


static int parse_end_token(Context *ctx)
{
    if (SWAP32(*(ctx->tokens)) != 0x0000FFFF)   // end token always 0x0000FFFF.
        return 0;  // not us, eat no tokens.

    if (!ctx->know_shader_size)  // this is the end of stream!
        ctx->tokencount = 1;
    else if (ctx->tokencount != 1)  // we _must_ be last. If not: fail.
        fail(ctx, "end token before end of stream");

    if (!isfail(ctx))
        ctx->profile->end_emitter(ctx);

    return 1;
} // parse_end_token


static int parse_phase_token(Context *ctx)
{
    // !!! FIXME: needs state; allow only one phase token per shader, I think?
    if (SWAP32(*(ctx->tokens)) != 0x0000FFFD) // phase token always 0x0000FFFD.
        return 0;  // not us, eat no tokens.

    if ( (!shader_is_pixel(ctx)) || (!shader_version_exactly(ctx, 1, 4)) )
        fail(ctx, "phase token only available in 1.4 pixel shaders");

    if (!isfail(ctx))
        ctx->profile->phase_emitter(ctx);

    return 1;
} // parse_phase_token


static int parse_token(Context *ctx)
{
    int rc = 0;

    assert(ctx->output_stack_len == 0);

    if (ctx->tokencount == 0)
        fail(ctx, "unexpected end of shader.");

    else if ((rc = parse_comment_token(ctx)) != 0)
        return rc;

    else if ((rc = parse_end_token(ctx)) != 0)
        return rc;

    else if ((rc = parse_phase_token(ctx)) != 0)
        return rc;

    else if ((rc = parse_instruction_token(ctx)) != 0)
        return rc;

    failf(ctx, "unknown token (0x%x)", (uint) *ctx->tokens);
    return 1;  // good luck!
} // parse_token


static int find_profile_id(const char *profile)
{
    size_t i;
    for (i = 0; i < STATICARRAYLEN(profileMap); i++)
    {
        const char *name = profileMap[i].from;
        if (strcmp(name, profile) == 0)
        {
            profile = profileMap[i].to;
            break;
        } // if
    } // for

    for (i = 0; i < STATICARRAYLEN(profiles); i++)
    {
        const char *name = profiles[i].name;
        if (strcmp(name, profile) == 0)
            return i;
    } // for

    return -1;  // no match.
} // find_profile_id


static Context *build_context(const char *profile,
                              const char *mainfn,
                              const unsigned char *tokenbuf,
                              const unsigned int bufsize,
                              const MOJOSHADER_swizzle *swiz,
                              const unsigned int swizcount,
                              const MOJOSHADER_samplerMap *smap,
                              const unsigned int smapcount,
                              MOJOSHADER_malloc m, MOJOSHADER_free f, void *d)
{
    if (m == NULL) m = MOJOSHADER_internal_malloc;
    if (f == NULL) f = MOJOSHADER_internal_free;

    Context *ctx = (Context *) m(sizeof (Context), d);
    if (ctx == NULL)
        return NULL;

    memset(ctx, '\0', sizeof (Context));
    ctx->malloc = m;
    ctx->free = f;
    ctx->malloc_data = d;
    ctx->tokens = (const uint32 *) tokenbuf;
    ctx->orig_tokens = (const uint32 *) tokenbuf;
    ctx->know_shader_size = (bufsize != 0);
    ctx->tokencount = ctx->know_shader_size ? (bufsize / sizeof (uint32)) : 0xFFFFFFFF;
    ctx->swizzles = swiz;
    ctx->swizzles_count = swizcount;
    ctx->samplermap = smap;
    ctx->samplermap_count = smapcount;
    ctx->endline = ENDLINE_STR;
    ctx->endline_len = strlen(ctx->endline);
    ctx->last_address_reg_component = -1;
    ctx->current_position = MOJOSHADER_POSITION_BEFORE;
    ctx->texm3x2pad_dst0 = -1;
    ctx->texm3x2pad_src0 = -1;
    ctx->texm3x3pad_dst0 = -1;
    ctx->texm3x3pad_src0 = -1;
    ctx->texm3x3pad_dst1 = -1;
    ctx->texm3x3pad_src1 = -1;

    ctx->errors = errorlist_create(MallocBridge, FreeBridge, ctx);
    if (ctx->errors == NULL)
    {
        f((void *) ctx->mainfn, d);
        f(ctx, d);
        return NULL;
    } // if

    if (!set_output(ctx, &ctx->mainline))
    {
        errorlist_destroy(ctx->errors);
        f((void *) ctx->mainfn, d);
        f(ctx, d);
        return NULL;
    } // if

    if (mainfn)
        ctx->mainfn = StrDup(ctx, mainfn);

    const int profileid = find_profile_id(profile);
    ctx->profileid = profileid;
    if (profileid >= 0)
        ctx->profile = &profiles[profileid];
    else
        failf(ctx, "Profile '%s' is unknown or unsupported", profile);

    return ctx;
} // build_context


static void free_constants_list(MOJOSHADER_free f, void *d, ConstantsList *item)
{
    while (item != NULL)
    {
        ConstantsList *next = item->next;
        f(item, d);
        item = next;
    } // while
} // free_constants_list


static void free_variable_list(MOJOSHADER_free f, void *d, VariableList *item)
{
    while (item != NULL)
    {
        VariableList *next = item->next;
        f(item, d);
        item = next;
    } // while
} // free_variable_list


static void free_sym_typeinfo(MOJOSHADER_free f, void *d,
                              MOJOSHADER_symbolTypeInfo *typeinfo)
{
    int i;
    for (i = 0; i < typeinfo->member_count; i++)
    {
        f((void *) typeinfo->members[i].name, d);
        free_sym_typeinfo(f, d, &typeinfo->members[i].info);
    } // for
    f((void *) typeinfo->members, d);
} // free_sym_members


static void free_symbols(MOJOSHADER_free f, void *d, MOJOSHADER_symbol *syms,
                         const int symcount)
{
    int i;
    for (i = 0; i < symcount; i++)
    {
        f((void *) syms[i].name, d);
        free_sym_typeinfo(f, d, &syms[i].info);
    } // for
    f((void *) syms, d);
} // free_symbols


static void destroy_context(Context *ctx)
{
    if (ctx != NULL)
    {
        MOJOSHADER_free f = ((ctx->free != NULL) ? ctx->free : MOJOSHADER_internal_free);
        void *d = ctx->malloc_data;
        buffer_destroy(ctx->preflight);
        buffer_destroy(ctx->globals);
        buffer_destroy(ctx->inputs);
        buffer_destroy(ctx->outputs);
        buffer_destroy(ctx->helpers);
        buffer_destroy(ctx->subroutines);
        buffer_destroy(ctx->mainline_intro);
        buffer_destroy(ctx->mainline_arguments);
        buffer_destroy(ctx->mainline_top);
        buffer_destroy(ctx->mainline);
        buffer_destroy(ctx->postflight);
        buffer_destroy(ctx->ignore);
        free_constants_list(f, d, ctx->constants);
        free_reglist(f, d, ctx->used_registers.next);
        free_reglist(f, d, ctx->defined_registers.next);
        free_reglist(f, d, ctx->uniforms.next);
        free_reglist(f, d, ctx->attributes.next);
        free_reglist(f, d, ctx->samplers.next);
        free_variable_list(f, d, ctx->variables);
        errorlist_destroy(ctx->errors);
        free_symbols(f, d, ctx->ctab.symbols, ctx->ctab.symbol_count);
        MOJOSHADER_freePreshader(ctx->preshader, f, d);
        f((void *) ctx->mainfn, d);
        f(ctx, d);
    } // if
} // destroy_context


static char *build_output(Context *ctx, size_t *len)
{
    // add a byte for a null terminator.
    Buffer *buffers[] = {
        ctx->preflight, ctx->globals, ctx->inputs, ctx->outputs, ctx->helpers,
        ctx->subroutines, ctx->mainline_intro, ctx->mainline_arguments,
        ctx->mainline_top, ctx->mainline, ctx->postflight
        // don't append ctx->ignore ... that's why it's called "ignore"
    };
    char *retval = buffer_merge(buffers, STATICARRAYLEN(buffers), len);
    return retval;
} // build_output


static inline const char *alloc_varname(Context *ctx, const RegisterList *reg)
{
    return ctx->profile->get_varname(ctx, reg->regtype, reg->regnum);
} // alloc_varname


// !!! FIXME: this code is sort of hard to follow:
// !!! FIXME:  "var->used" only applies to arrays (at the moment, at least,
// !!! FIXME:  but this might be buggy at a later time?), and this code
// !!! FIXME:  relies on that.
// !!! FIXME: "variables" means "things we found in a CTAB" but it's not
// !!! FIXME:  all registers, etc.
// !!! FIXME: "const_array" means an array for d3d "const" registers (c0, c1,
// !!! FIXME:  etc), but not a constant array, although they _can_ be.
// !!! FIXME: It's just a mess.  :/
static MOJOSHADER_uniform *build_uniforms(Context *ctx)
{
    const size_t len = sizeof (MOJOSHADER_uniform) * ctx->uniform_count;
    MOJOSHADER_uniform *retval = (MOJOSHADER_uniform *) Malloc(ctx, len);

    if (retval != NULL)
    {
        MOJOSHADER_uniform *wptr = retval;
        memset(wptr, '\0', len);

        VariableList *var;
        int written = 0;
        for (var = ctx->variables; var != NULL; var = var->next)
        {
            if (var->used)
            {
                const char *name = ctx->profile->get_const_array_varname(ctx,
                                                      var->index, var->count);
                if (name != NULL)
                {
                    wptr->type = MOJOSHADER_UNIFORM_FLOAT;
                    wptr->index = var->index;
                    wptr->array_count = var->count;
                    wptr->constant = (var->constant != NULL) ? 1 : 0;
                    wptr->name = name;
                    wptr++;
                    written++;
                } // if
            } // if
        } // for

        RegisterList *item = ctx->uniforms.next;
        MOJOSHADER_uniformType type = MOJOSHADER_UNIFORM_FLOAT;
        while (written < ctx->uniform_count)
        {
            int skip = 0;

            // !!! FIXME: does this fail if written > ctx->uniform_count?
            if (item == NULL)
            {
                fail(ctx, "BUG: mismatched uniform list and count");
                break;
            } // if

            int index = item->regnum;
            switch (item->regtype)
            {
                case REG_TYPE_CONST:
                    skip = (item->array != NULL);
                    type = MOJOSHADER_UNIFORM_FLOAT;
                    break;

                case REG_TYPE_CONSTINT:
                    type = MOJOSHADER_UNIFORM_INT;
                    break;

                case REG_TYPE_CONSTBOOL:
                    type = MOJOSHADER_UNIFORM_BOOL;
                    break;

                default:
                    fail(ctx, "unknown uniform datatype");
                    break;
            } // switch

            if (!skip)
            {
                wptr->type = type;
                wptr->index = index;
                wptr->array_count = 0;
                wptr->name = alloc_varname(ctx, item);
                wptr++;
                written++;
            } // if

            item = item->next;
        } // for
    } // if

    return retval;
} // build_uniforms


static MOJOSHADER_constant *build_constants(Context *ctx)
{
    const size_t len = sizeof (MOJOSHADER_constant) * ctx->constant_count;
    MOJOSHADER_constant *retval = (MOJOSHADER_constant *) Malloc(ctx, len);

    if (retval != NULL)
    {
        ConstantsList *item = ctx->constants;
        int i;

        for (i = 0; i < ctx->constant_count; i++)
        {
            if (item == NULL)
            {
                fail(ctx, "BUG: mismatched constant list and count");
                break;
            } // if

            memcpy(&retval[i], &item->constant, sizeof (MOJOSHADER_constant));
            item = item->next;
        } // for
    } // if

    return retval;
} // build_constants


static MOJOSHADER_sampler *build_samplers(Context *ctx)
{
    const size_t len = sizeof (MOJOSHADER_sampler) * ctx->sampler_count;
    MOJOSHADER_sampler *retval = (MOJOSHADER_sampler *) Malloc(ctx, len);

    if (retval != NULL)
    {
        RegisterList *item = ctx->samplers.next;
        int i;

        memset(retval, '\0', len);

        for (i = 0; i < ctx->sampler_count; i++)
        {
            if (item == NULL)
            {
                fail(ctx, "BUG: mismatched sampler list and count");
                break;
            } // if

            assert(item->regtype == REG_TYPE_SAMPLER);
            retval[i].type = cvtD3DToMojoSamplerType((TextureType) item->index);
            retval[i].index = item->regnum;
            retval[i].name = alloc_varname(ctx, item);
            retval[i].texbem = (item->misc != 0) ? 1 : 0;
            item = item->next;
        } // for
    } // if

    return retval;
} // build_samplers


static MOJOSHADER_attribute *build_attributes(Context *ctx, int *_count)
{
    int count = 0;

    if (ctx->attribute_count == 0)
    {
        *_count = 0;
        return NULL;  // nothing to do.
    } // if

    const size_t len = sizeof (MOJOSHADER_attribute) * ctx->attribute_count;
    MOJOSHADER_attribute *retval = (MOJOSHADER_attribute *) Malloc(ctx, len);

    if (retval != NULL)
    {
        RegisterList *item = ctx->attributes.next;
        MOJOSHADER_attribute *wptr = retval;
        int ignore = 0;
        int i;

        memset(retval, '\0', len);

        for (i = 0; i < ctx->attribute_count; i++)
        {
            if (item == NULL)
            {
                fail(ctx, "BUG: mismatched attribute list and count");
                break;
            } // if

            switch (item->regtype)
            {
                case REG_TYPE_RASTOUT:
                case REG_TYPE_ATTROUT:
                case REG_TYPE_TEXCRDOUT:
                case REG_TYPE_COLOROUT:
                case REG_TYPE_DEPTHOUT:
                    ignore = 1;
                    break;
                case REG_TYPE_TEXTURE:
                case REG_TYPE_MISCTYPE:
                case REG_TYPE_INPUT:
                    ignore = shader_is_pixel(ctx);
                    break;
                default:
                    ignore = 0;
                    break;
            } // switch

            if (!ignore)
            {
                if (shader_is_pixel(ctx))
                    fail(ctx, "BUG: pixel shader with vertex attributes");
                else
                {
                    wptr->usage = item->usage;
                    wptr->index = item->index;
                    wptr->name = alloc_varname(ctx, item);
                    wptr++;
                    count++;
                } // else
            } // if

            item = item->next;
        } // for
    } // if

    *_count = count;
    return retval;
} // build_attributes

static MOJOSHADER_attribute *build_outputs(Context *ctx, int *_count)
{
    int count = 0;

    if (ctx->attribute_count == 0)
    {
        *_count = 0;
        return NULL;  // nothing to do.
    } // if

    const size_t len = sizeof (MOJOSHADER_attribute) * ctx->attribute_count;
    MOJOSHADER_attribute *retval = (MOJOSHADER_attribute *) Malloc(ctx, len);

    if (retval != NULL)
    {
        RegisterList *item = ctx->attributes.next;
        MOJOSHADER_attribute *wptr = retval;
        int i;

        memset(retval, '\0', len);

        for (i = 0; i < ctx->attribute_count; i++)
        {
            if (item == NULL)
            {
                fail(ctx, "BUG: mismatched attribute list and count");
                break;
            } // if

            switch (item->regtype)
            {
                case REG_TYPE_RASTOUT:
                case REG_TYPE_ATTROUT:
                case REG_TYPE_TEXCRDOUT:
                case REG_TYPE_COLOROUT:
                case REG_TYPE_DEPTHOUT:
                    wptr->usage = item->usage;
                    wptr->index = item->index;
                    wptr->name = alloc_varname(ctx, item);
                    wptr++;
                    count++;
                    break;
                default:
                    break;
            } // switch


            item = item->next;
        } // for
    } // if

    *_count = count;
    return retval;
} // build_outputs


static MOJOSHADER_parseData *build_parsedata(Context *ctx)
{
    char *output = NULL;
    MOJOSHADER_constant *constants = NULL;
    MOJOSHADER_uniform *uniforms = NULL;
    MOJOSHADER_attribute *attributes = NULL;
    MOJOSHADER_attribute *outputs = NULL;
    MOJOSHADER_sampler *samplers = NULL;
    MOJOSHADER_swizzle *swizzles = NULL;
    MOJOSHADER_error *errors = NULL;
    MOJOSHADER_parseData *retval = NULL;
    size_t output_len = 0;
    int attribute_count = 0;
    int output_count = 0;

    if (ctx->out_of_memory)
        return &MOJOSHADER_out_of_mem_data;

    retval = (MOJOSHADER_parseData*) Malloc(ctx, sizeof(MOJOSHADER_parseData));
    if (retval == NULL)
        return &MOJOSHADER_out_of_mem_data;

    memset(retval, '\0', sizeof (MOJOSHADER_parseData));

    if (!isfail(ctx))
        output = build_output(ctx, &output_len);

    if (!isfail(ctx))
        constants = build_constants(ctx);

    if (!isfail(ctx))
        uniforms = build_uniforms(ctx);

    if (!isfail(ctx))
        attributes = build_attributes(ctx, &attribute_count);

    if (!isfail(ctx))
        outputs = build_outputs(ctx, &output_count);

    if (!isfail(ctx))
        samplers = build_samplers(ctx);

    const int error_count = errorlist_count(ctx->errors);
    errors = errorlist_flatten(ctx->errors);

    if (!isfail(ctx))
    {
        if (ctx->swizzles_count > 0)
        {
            const int len = ctx->swizzles_count * sizeof (MOJOSHADER_swizzle);
            swizzles = (MOJOSHADER_swizzle *) Malloc(ctx, len);
            if (swizzles != NULL)
                memcpy(swizzles, ctx->swizzles, len);
        } // if
    } // if

    // check again, in case build_output, etc, ran out of memory.
    if (isfail(ctx))
    {
        int i;

        Free(ctx, output);
        Free(ctx, constants);
        Free(ctx, swizzles);

        if (uniforms != NULL)
        {
            for (i = 0; i < ctx->uniform_count; i++)
                Free(ctx, (void *) uniforms[i].name);
            Free(ctx, uniforms);
        } // if

        if (attributes != NULL)
        {
            for (i = 0; i < attribute_count; i++)
                Free(ctx, (void *) attributes[i].name);
            Free(ctx, attributes);
        } // if

        if (outputs != NULL)
        {
            for (i = 0; i < output_count; i++)
                Free(ctx, (void *) outputs[i].name);
            Free(ctx, outputs);
        } // if

        if (samplers != NULL)
        {
            for (i = 0; i < ctx->sampler_count; i++)
                Free(ctx, (void *) samplers[i].name);
            Free(ctx, samplers);
        } // if

        if (ctx->out_of_memory)
        {
            for (i = 0; i < error_count; i++)
            {
                Free(ctx, (void *) errors[i].filename);
                Free(ctx, (void *) errors[i].error);
            } // for
            Free(ctx, errors);
            Free(ctx, retval);
            return &MOJOSHADER_out_of_mem_data;
        } // if
    } // if
    else
    {
        retval->profile = ctx->profile->name;
        retval->output = output;
        retval->output_len = (int) output_len;
        retval->instruction_count = ctx->instruction_count;
        retval->shader_type = ctx->shader_type;
        retval->major_ver = (int) ctx->major_ver;
        retval->minor_ver = (int) ctx->minor_ver;
        retval->uniform_count = ctx->uniform_count;
        retval->uniforms = uniforms;
        retval->constant_count = ctx->constant_count;
        retval->constants = constants;
        retval->sampler_count = ctx->sampler_count;
        retval->samplers = samplers;
        retval->attribute_count = attribute_count;
        retval->attributes = attributes;
        retval->output_count = output_count;
        retval->outputs = outputs;
        retval->swizzle_count = ctx->swizzles_count;
        retval->swizzles = swizzles;
        retval->symbol_count = ctx->ctab.symbol_count;
        retval->symbols = ctx->ctab.symbols;
        retval->preshader = ctx->preshader;
        retval->mainfn = ctx->mainfn;

        // we don't own these now, retval does.
        ctx->ctab.symbols = NULL;
        ctx->preshader = NULL;
        ctx->ctab.symbol_count = 0;
        ctx->mainfn = NULL;
    } // else

    retval->error_count = error_count;
    retval->errors = errors;
    retval->malloc = (ctx->malloc == MOJOSHADER_internal_malloc) ? NULL : ctx->malloc;
    retval->free = (ctx->free == MOJOSHADER_internal_free) ? NULL : ctx->free;
    retval->malloc_data = ctx->malloc_data;

    return retval;
} // build_parsedata


static void process_definitions(Context *ctx)
{
    // !!! FIXME: apparently, pre ps_3_0, sampler registers don't need to be
    // !!! FIXME:  DCL'd before use (default to 2d?). We aren't checking
    // !!! FIXME:  this at the moment, though.

    determine_constants_arrays(ctx);  // in case this hasn't been called yet.

    RegisterList *uitem = &ctx->uniforms;
    RegisterList *prev = &ctx->used_registers;
    RegisterList *item = prev->next;

    while (item != NULL)
    {
        RegisterList *next = item->next;
        const RegisterType regtype = item->regtype;
        const int regnum = item->regnum;

        if (!get_defined_register(ctx, regtype, regnum))
        {
            // haven't already dealt with this one.
            switch (regtype)
            {
                // !!! FIXME: I'm not entirely sure this is right...
                case REG_TYPE_RASTOUT:
                case REG_TYPE_ATTROUT:
                case REG_TYPE_TEXCRDOUT:
                case REG_TYPE_COLOROUT:
                case REG_TYPE_DEPTHOUT:
                    if (shader_is_vertex(ctx)&&shader_version_atleast(ctx,3,0))
                    {
                        fail(ctx, "vs_3 can't use output registers"
                                  " without declaring them first.");
                        return;
                    } // if

                    // Apparently this is an attribute that wasn't DCL'd.
                    //  Add it to the attribute list; deal with it later.
                    // !!! FIXME: we should use something other than UNKNOWN here.
                    add_attribute_register(ctx, regtype, regnum,
                                           MOJOSHADER_USAGE_UNKNOWN, 0, 0xF, 0);
                    break;

                case REG_TYPE_ADDRESS:
                case REG_TYPE_PREDICATE:
                case REG_TYPE_TEMP:
                case REG_TYPE_LOOP:
                case REG_TYPE_LABEL:
                    ctx->profile->global_emitter(ctx, regtype, regnum);
                    break;

                case REG_TYPE_CONST:
                case REG_TYPE_CONSTINT:
                case REG_TYPE_CONSTBOOL:
                    // separate uniforms into a different list for now.
                    prev->next = next;
                    item->next = NULL;
                    uitem->next = item;
                    uitem = item;
                    item = prev;
                    break;

                case REG_TYPE_INPUT:
                    // You don't have to dcl_ your inputs in Shader Model 1.
                    if (shader_is_pixel(ctx)&&!shader_version_atleast(ctx,2,0))
                    {
                        add_attribute_register(ctx, regtype, regnum,
                                               MOJOSHADER_USAGE_COLOR, regnum,
                                               0xF, 0);
                        break;
                    } // if
                    // fall through...

                default:
                    fail(ctx, "BUG: we used a register we don't know how to define.");
            } // switch
        } // if

        prev = item;
        item = next;
    } // while

    // okay, now deal with uniform/constant arrays...
    VariableList *var;
    for (var = ctx->variables; var != NULL; var = var->next)
    {
        if (var->used)
        {
            if (var->constant)
            {
                ctx->profile->const_array_emitter(ctx, var->constant,
                                                  var->index, var->count);
            } // if
            else
            {
                ctx->profile->array_emitter(ctx, var);
                ctx->uniform_float4_count += var->count;
                ctx->uniform_count++;
            } // else
        } // if
    } // for

    // ...and uniforms...
    for (item = ctx->uniforms.next; item != NULL; item = item->next)
    {
        int arraysize = -1;

        // check if this is a register contained in an array...
        if (item->regtype == REG_TYPE_CONST)
        {
            for (var = ctx->variables; var != NULL; var = var->next)
            {
                if (!var->used)
                    continue;

                const int regnum = item->regnum;
                const int lo = var->index;
                if ( (regnum >= lo) && (regnum < (lo + var->count)) )
                {
                    assert(!var->constant);
                    item->array = var;  // used when building parseData.
                    arraysize = var->count;
                    break;
                } // if
            } // for
        } // if

        ctx->profile->uniform_emitter(ctx, item->regtype, item->regnum, var);

        if (arraysize < 0)  // not part of an array?
        {
            ctx->uniform_count++;
            switch (item->regtype)
            {
                case REG_TYPE_CONST: ctx->uniform_float4_count++; break;
                case REG_TYPE_CONSTINT: ctx->uniform_int4_count++; break;
                case REG_TYPE_CONSTBOOL: ctx->uniform_bool_count++; break;
                default: break;
            } // switch
        } // if
    } // for

    // ...and samplers...
    for (item = ctx->samplers.next; item != NULL; item = item->next)
    {
        ctx->sampler_count++;
        ctx->profile->sampler_emitter(ctx, item->regnum,
                                      (TextureType) item->index,
                                      item->misc != 0);
    } // for

    // ...and attributes...
    for (item = ctx->attributes.next; item != NULL; item = item->next)
    {
        ctx->attribute_count++;
        ctx->profile->attribute_emitter(ctx, item->regtype, item->regnum,
                                        item->usage, item->index,
                                        item->writemask, item->misc);
    } // for
} // process_definitions


static void verify_swizzles(Context *ctx)
{
    size_t i;
    const char *failmsg = "invalid swizzle";
    for (i = 0; i < ctx->swizzles_count; i++)
    {
        const MOJOSHADER_swizzle *swiz = &ctx->swizzles[i];
        if (swiz->swizzles[0] > 3) { fail(ctx, failmsg); return; }
        if (swiz->swizzles[1] > 3) { fail(ctx, failmsg); return; }
        if (swiz->swizzles[2] > 3) { fail(ctx, failmsg); return; }
        if (swiz->swizzles[3] > 3) { fail(ctx, failmsg); return; }
    } // for
} // verify_swizzles


// API entry point...

// !!! FIXME:
// MSDN: "Shader validation will fail CreatePixelShader on any shader that
//  attempts to read from a temporary register that has not been written by a
//  previous instruction."  (true for ps_1_*, maybe others). Check this.

const MOJOSHADER_parseData *MOJOSHADER_parse(const char *profile,
                                             const char *mainfn,
                                             const unsigned char *tokenbuf,
                                             const unsigned int bufsize,
                                             const MOJOSHADER_swizzle *swiz,
                                             const unsigned int swizcount,
                                             const MOJOSHADER_samplerMap *smap,
                                             const unsigned int smapcount,
                                             MOJOSHADER_malloc m,
                                             MOJOSHADER_free f, void *d)
{
    MOJOSHADER_parseData *retval = NULL;
    Context *ctx = NULL;
    int rc = 0;
    int failed = 0;

    if ( ((m == NULL) && (f != NULL)) || ((m != NULL) && (f == NULL)) )
        return &MOJOSHADER_out_of_mem_data;  // supply both or neither.

    ctx = build_context(profile, mainfn, tokenbuf, bufsize, swiz, swizcount,
                        smap, smapcount, m, f, d);
    if (ctx == NULL)
        return &MOJOSHADER_out_of_mem_data;
	
    if (isfail(ctx))
    {
        retval = build_parsedata(ctx);
        destroy_context(ctx);
        return retval;
    } // if

    verify_swizzles(ctx);

    // Version token always comes first.
    ctx->current_position = 0;
    rc = parse_version_token(ctx, profile);

    if (!ctx->mainfn)
        ctx->mainfn = StrDup(ctx, "main");

    // drop out now if this definitely isn't bytecode. Saves lots of
    //  meaningless errors flooding through.
    if (rc < 0)
    {
        retval = build_parsedata(ctx);
        destroy_context(ctx);
        return retval;
    } // if

    if ( ((uint32) rc) > ctx->tokencount )
    {
        fail(ctx, "Corrupted or truncated shader");
        ctx->tokencount = rc;
    } // if

    adjust_token_position(ctx, rc);

    // parse out the rest of the tokens after the version token...
    while (ctx->tokencount > 0)
    {
        if (!ctx->know_shader_size)
            ctx->tokencount = 0xFFFFFFFF;  // keep this value obscenely large.

        // reset for each token.
        if (isfail(ctx))
        {
            failed = 1;
            ctx->isfail = 0;
        } // if

        rc = parse_token(ctx);
        if ( ((uint32) rc) > ctx->tokencount )
        {
            fail(ctx, "Corrupted or truncated shader");
            break;
        } // if

        adjust_token_position(ctx, rc);
    } // while

    ctx->current_position = MOJOSHADER_POSITION_AFTER;

    // for ps_1_*, the output color is written to r0...throw an
    //  error if this register was never written. This isn't
    //  important for vertex shaders, or shader model 2+.
    if (shader_is_pixel(ctx) && !shader_version_atleast(ctx, 2, 0))
    {
        if (!register_was_written(ctx, REG_TYPE_TEMP, 0))
            fail(ctx, "r0 (pixel shader 1.x color output) never written to");
    } // if

    if (!failed)
    {
        process_definitions(ctx);
        failed = isfail(ctx);
    } // if

    if (!failed)
        ctx->profile->finalize_emitter(ctx);

    ctx->isfail = failed;
    retval = build_parsedata(ctx);
    destroy_context(ctx);
    return retval;
} // MOJOSHADER_parse


void MOJOSHADER_freeParseData(const MOJOSHADER_parseData *_data)
{
    MOJOSHADER_parseData *data = (MOJOSHADER_parseData *) _data;
    if ((data == NULL) || (data == &MOJOSHADER_out_of_mem_data))
        return;  // no-op.

    MOJOSHADER_free f = (data->free == NULL) ? MOJOSHADER_internal_free : data->free;
    void *d = data->malloc_data;
    int i;

    // we don't f(data->profile), because that's internal static data.

    f((void *) data->mainfn, d);
    f((void *) data->output, d);
    f((void *) data->constants, d);
    f((void *) data->swizzles, d);

    for (i = 0; i < data->error_count; i++)
    {
        f((void *) data->errors[i].error, d);
        f((void *) data->errors[i].filename, d);
    } // for
    f((void *) data->errors, d);

    for (i = 0; i < data->uniform_count; i++)
        f((void *) data->uniforms[i].name, d);
    f((void *) data->uniforms, d);

    for (i = 0; i < data->attribute_count; i++)
        f((void *) data->attributes[i].name, d);
    f((void *) data->attributes, d);

    for (i = 0; i < data->output_count; i++)
        f((void *) data->outputs[i].name, d);
    f((void *) data->outputs, d);

    for (i = 0; i < data->sampler_count; i++)
        f((void *) data->samplers[i].name, d);
    f((void *) data->samplers, d);

    free_symbols(f, d, data->symbols, data->symbol_count);
    MOJOSHADER_freePreshader(data->preshader, f, d);

    f(data, d);
} // MOJOSHADER_freeParseData


int MOJOSHADER_version(void)
{
    return MOJOSHADER_VERSION;
} // MOJOSHADER_version


const char *MOJOSHADER_changeset(void)
{
    return MOJOSHADER_CHANGESET;
} // MOJOSHADER_changeset


int MOJOSHADER_maxShaderModel(const char *profile)
{
    #define PROFILE_SHADER_MODEL(p,v) if (strcmp(profile, p) == 0) return v;
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_D3D, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_BYTECODE, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSL, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSL120, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSLES, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_ARB1, 2);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_NV2, 2);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_NV3, 2);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_NV4, 3);
    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_METAL, 3);
    #undef PROFILE_SHADER_MODEL
    return -1;  // unknown profile?
} // MOJOSHADER_maxShaderModel


const MOJOSHADER_preshader *MOJOSHADER_parsePreshader(const unsigned char *buf,
                                                      const unsigned int _len,
                                                      MOJOSHADER_malloc m,
                                                      MOJOSHADER_free f,
                                                      void *d)
{
    // !!! FIXME: This is copypasta ripped from parse_preshader -flibit
    int i;

    const uint32 *tokens = (const uint32 *) buf;
    uint32 tokcount = _len / 4;

    // !!! FIXME: This is stupid. -flibit
    Context fillerContext;
    fillerContext.malloc = m;
    fillerContext.free = f;
    fillerContext.malloc_data = d;
    fillerContext.out_of_memory = 0;

    MOJOSHADER_preshader *preshader = NULL;

    const uint32 version_magic = 0x46580000;
    const uint32 min_version = 0x00000200 | version_magic;
    const uint32 max_version = 0x00000201 | version_magic;
    const uint32 version = SWAP32(tokens[0]);
    if (version < min_version || version > max_version)
    {
        // fail because the shader will malfunction w/o this.
        goto parsePreshader_notAPreshader;
    } // if

    tokens++;
    tokcount++;

    // All sections of a preshader are packed into separate comment tokens,
    //  inside the containing comment token block. Find them all before
    //  we start, so we don't care about the order they appear in the file.
    PreshaderBlockInfo ctab = { 0, 0, 0 };
    PreshaderBlockInfo prsi = { 0, 0, 0 };
    PreshaderBlockInfo fxlc = { 0, 0, 0 };
    PreshaderBlockInfo clit = { 0, 0, 0 };

    while (tokcount > 0)
    {
        uint32 subtokcount = 0;
        // !!! FIXME: Passing a NULL ctx! -flibit
        if ( (!is_comment_token(&fillerContext, *tokens, &subtokcount)) ||
             (subtokcount > tokcount) )
        {
            goto parsePreshader_notAPreshader;
        } // if

        tokens++;
        tokcount--;

        const uint32 *nexttokens = tokens + subtokcount;
        const uint32 nexttokcount = tokcount - subtokcount;

        if (subtokcount > 0)
        {
            switch (SWAP32(*tokens))
            {
                #define PRESHADER_BLOCK_CASE(id, var) \
                    case id##_ID: { \
                        if (var.seen) \
                            goto parsePreshader_notAPreshader; \
                        var.tokens = tokens; \
                        var.tokcount = subtokcount; \
                        var.seen = 1; \
                        break; \
                    }
                PRESHADER_BLOCK_CASE(CTAB, ctab);
                PRESHADER_BLOCK_CASE(PRSI, prsi);
                PRESHADER_BLOCK_CASE(FXLC, fxlc);
                PRESHADER_BLOCK_CASE(CLIT, clit);
                default:
                    // Bogus preshader section
                    goto parsePreshader_notAPreshader;
                #undef PRESHADER_BLOCK_CASE
            } // switch
        } // if

        tokens = nexttokens;
        tokcount = nexttokcount;
    } // while

    if (!ctab.seen) // No CTAB block in preshader
        goto parsePreshader_notAPreshader;
    if (!fxlc.seen) // No FXLC block in preshader
        goto parsePreshader_notAPreshader;
    if (!clit.seen) // No CLIT block in preshader
        goto parsePreshader_notAPreshader;

    preshader = (MOJOSHADER_preshader *) m(sizeof (MOJOSHADER_preshader), d);
    if (preshader == NULL)
        goto parsePreshader_outOfMemory;
    memset(preshader, '\0', sizeof (MOJOSHADER_preshader));

    // Let's set up the constant literals first...
    if (clit.tokcount == 0) // Bogus CLIT block in preshader
        goto parsePreshader_notAPreshader;
    else
    {
        const uint32 lit_count = SWAP32(clit.tokens[1]);
        if (lit_count > ((clit.tokcount - 2) / 2))
            // Bogus CLIT block in preshader
            goto parsePreshader_notAPreshader;
        else if (lit_count > 0)
        {
            preshader->literal_count = (unsigned int) lit_count;
            assert(sizeof (double) == 8);  // just in case.
            const size_t len = sizeof (double) * lit_count;
            preshader->literals = (double *) m(len, d);
            if (preshader->literals == NULL)
                goto parsePreshader_notAPreshader;  // oh well.
            const double *litptr = (const double *) (clit.tokens + 2);
            for (i = 0; i < lit_count; i++)
                preshader->literals[i] = SWAPDBL(litptr[i]);
        } // else if
    } // else

    // Parse out the PRSI block. This is used to map the output registers.
    uint32 output_map_count = 0;
    const uint32 *output_map = NULL;
    if (prsi.seen)
    {
        if (prsi.tokcount < 8) // Bogus preshader PRSI data
            goto parsePreshader_notAPreshader;

        //const uint32 first_output_reg = SWAP32(prsi.tokens[1]);
        // !!! FIXME: there are a lot of fields here I don't know about.
        // !!! FIXME:  maybe [2] and [3] are for int4 and bool registers?
        //const uint32 output_reg_count = SWAP32(prsi.tokens[4]);
        // !!! FIXME:  maybe [5] and [6] are for int4 and bool registers?
        output_map_count = SWAP32(prsi.tokens[7]);

        prsi.tokcount -= 8;
        prsi.tokens += 8;

        if (prsi.tokcount < ((output_map_count + 1) * 2))
            // Bogus preshader PRSI data
            goto parsePreshader_notAPreshader;

        output_map = prsi.tokens;
    }

    // Now we'll figure out the CTAB...
    CtabData ctabdata = { 0, 0, 0 };
    parse_constant_table(&fillerContext, ctab.tokens - 1, ctab.tokcount * 4,
                         version, 0, &ctabdata);

    // preshader owns this now. Don't free it in this function.
    preshader->symbol_count = ctabdata.symbol_count;
    preshader->symbols = ctabdata.symbols;

    if (!ctabdata.have_ctab) // Bogus preshader CTAB data
        goto parsePreshader_notAPreshader;

    // The FXLC block has the actual instructions...
    uint32 opcode_count = SWAP32(fxlc.tokens[1]);

    size_t len = sizeof (MOJOSHADER_preshaderInstruction) * opcode_count;
    preshader->instruction_count = (unsigned int) opcode_count;
    preshader->instructions = (MOJOSHADER_preshaderInstruction *)
                                m(len, d);
    if (preshader->instructions == NULL)
        goto parsePreshader_outOfMemory;
    memset(preshader->instructions, '\0', len);

    fxlc.tokens += 2;
    fxlc.tokcount -= 2;
    if (opcode_count > (fxlc.tokcount / 2)) // Bogus preshader FXLC block
        goto parsePreshader_notAPreshader;

    MOJOSHADER_preshaderInstruction *inst = preshader->instructions;
    while (opcode_count--)
    {
        const uint32 opcodetok = SWAP32(fxlc.tokens[0]);
        MOJOSHADER_preshaderOpcode opcode = MOJOSHADER_PRESHADEROP_NOP;
        switch ((opcodetok >> 16) & 0xFFFF)
        {
            case 0x1000: opcode = MOJOSHADER_PRESHADEROP_MOV; break;
            case 0x1010: opcode = MOJOSHADER_PRESHADEROP_NEG; break;
            case 0x1030: opcode = MOJOSHADER_PRESHADEROP_RCP; break;
            case 0x1040: opcode = MOJOSHADER_PRESHADEROP_FRC; break;
            case 0x1050: opcode = MOJOSHADER_PRESHADEROP_EXP; break;
            case 0x1060: opcode = MOJOSHADER_PRESHADEROP_LOG; break;
            case 0x1070: opcode = MOJOSHADER_PRESHADEROP_RSQ; break;
            case 0x1080: opcode = MOJOSHADER_PRESHADEROP_SIN; break;
            case 0x1090: opcode = MOJOSHADER_PRESHADEROP_COS; break;
            case 0x10A0: opcode = MOJOSHADER_PRESHADEROP_ASIN; break;
            case 0x10B0: opcode = MOJOSHADER_PRESHADEROP_ACOS; break;
            case 0x10C0: opcode = MOJOSHADER_PRESHADEROP_ATAN; break;
            case 0x2000: opcode = MOJOSHADER_PRESHADEROP_MIN; break;
            case 0x2010: opcode = MOJOSHADER_PRESHADEROP_MAX; break;
            case 0x2020: opcode = MOJOSHADER_PRESHADEROP_LT; break;
            case 0x2030: opcode = MOJOSHADER_PRESHADEROP_GE; break;
            case 0x2040: opcode = MOJOSHADER_PRESHADEROP_ADD; break;
            case 0x2050: opcode = MOJOSHADER_PRESHADEROP_MUL; break;
            case 0x2060: opcode = MOJOSHADER_PRESHADEROP_ATAN2; break;
            case 0x2080: opcode = MOJOSHADER_PRESHADEROP_DIV; break;
            case 0x3000: opcode = MOJOSHADER_PRESHADEROP_CMP; break;
            case 0x3010: opcode = MOJOSHADER_PRESHADEROP_MOVC; break;
            case 0x5000: opcode = MOJOSHADER_PRESHADEROP_DOT; break;
            case 0x5020: opcode = MOJOSHADER_PRESHADEROP_NOISE; break;
            case 0xA000: opcode = MOJOSHADER_PRESHADEROP_MIN_SCALAR; break;
            case 0xA010: opcode = MOJOSHADER_PRESHADEROP_MAX_SCALAR; break;
            case 0xA020: opcode = MOJOSHADER_PRESHADEROP_LT_SCALAR; break;
            case 0xA030: opcode = MOJOSHADER_PRESHADEROP_GE_SCALAR; break;
            case 0xA040: opcode = MOJOSHADER_PRESHADEROP_ADD_SCALAR; break;
            case 0xA050: opcode = MOJOSHADER_PRESHADEROP_MUL_SCALAR; break;
            case 0xA060: opcode = MOJOSHADER_PRESHADEROP_ATAN2_SCALAR; break;
            case 0xA080: opcode = MOJOSHADER_PRESHADEROP_DIV_SCALAR; break;
            case 0xD000: opcode = MOJOSHADER_PRESHADEROP_DOT_SCALAR; break;
            case 0xD020: opcode = MOJOSHADER_PRESHADEROP_NOISE_SCALAR; break;
            default:
                // Unknown preshader opcode
                goto parsePreshader_notAPreshader;
        } // switch

        uint32 operand_count = SWAP32(fxlc.tokens[1]) + 1;  // +1 for dest.

        inst->opcode = opcode;
        inst->element_count = (unsigned int) (opcodetok & 0xFF);
        inst->operand_count = (unsigned int) operand_count;

        fxlc.tokens += 2;
        fxlc.tokcount -= 2;
        if ((operand_count * 3) > fxlc.tokcount)
            // Bogus preshader FXLC block
            goto parsePreshader_notAPreshader;

        MOJOSHADER_preshaderOperand *operand = inst->operands;
        while (operand_count--)
        {
            const unsigned int item = (unsigned int) SWAP32(fxlc.tokens[2]);

            // !!! FIXME: Is this used anywhere other than INPUT? -flibit
            const uint32 numArrays = SWAP32(fxlc.tokens[0]);

            switch (SWAP32(fxlc.tokens[1]))
            {
                case 1:  // literal from CLIT block.
                {
                    if ((item + inst->element_count) >= preshader->literal_count)
                        // Bogus preshader literal index
                        goto parsePreshader_notAPreshader;
                    operand->type = MOJOSHADER_PRESHADEROPERAND_LITERAL;
                    break;
                } // case

                case 2:  // item from ctabdata.
                {
                    MOJOSHADER_symbol *sym = ctabdata.symbols;
                    for (i = 0; i < ctabdata.symbol_count; i++, sym++)
                    {
                        const uint32 base = sym->register_index * 4;
                        const uint32 count = sym->register_count * 4;
                        assert(sym->register_set==MOJOSHADER_SYMREGSET_FLOAT4);
                        if ( (base <= item) && ((base + count) >= (item + inst->element_count)) )
                            break;
                    } // for
                    if (i == ctabdata.symbol_count)
                        // Bogus preshader input index
                        goto parsePreshader_notAPreshader;
                    operand->type = MOJOSHADER_PRESHADEROPERAND_INPUT;
                    if (numArrays > 0)
                    {
                        // malloc the array symbol name array
                        const uint32 siz = numArrays * sizeof (uint32);
                        operand->array_register_count = numArrays;
                        operand->array_registers = (uint32 *) m(siz, d);
                        memset(operand->array_registers, '\0', siz);
                        // Get each register base, indicating the arrays used.
                        for (i = 0; i < numArrays; i++)
                        {
                            const uint32 jmp = SWAP32(fxlc.tokens[4]);
                            const uint32 bigjmp = (jmp >> 4) * 4;
                            const uint32 ltljmp = (jmp >> 2) & 3;
                            operand->array_registers[i] = bigjmp + ltljmp;
                            fxlc.tokens += 2;
                            fxlc.tokcount -= 2;
                        } // for
                    } // if
                    break;
                } // case

                case 4:
                {
                    operand->type = MOJOSHADER_PRESHADEROPERAND_OUTPUT;
                    if (!prsi.seen)
                        // No PRSI tokens, no output map.
                        continue;
                    for (i = 0; i < output_map_count; i++)
                    {
                        const uint32 base = output_map[(i*2)] * 4;
                        const uint32 count = output_map[(i*2)+1] * 4;
                        if ( (base <= item) && ((base + count) >= (item + inst->element_count)) )
                            break;
                    } // for
                    if (i == output_map_count)
                        // Bogus preshader output index
                        goto parsePreshader_notAPreshader;
                    break;
                } // case

                case 7:
                {
                    operand->type = MOJOSHADER_PRESHADEROPERAND_TEMP;
                    if (item >= preshader->temp_count)
                        preshader->temp_count = item + inst->element_count;
                    break;
                } // case
            } // switch

            operand->index = item;

            fxlc.tokens += 3;
            fxlc.tokcount -= 3;
            operand++;
        } // while

        inst++;
    } // while

    // Registers need to be vec4, round up to nearest 4
    preshader->temp_count = (preshader->temp_count + 3) & ~3;

    unsigned int largest = 0;
    const MOJOSHADER_symbol *sym = preshader->symbols;
    for (i = 0; i < preshader->symbol_count; i++, sym++)
    {
        const unsigned int val = sym->register_index + sym->register_count;
        if (val > largest)
            largest = val;
    } // for

    if (largest > 0)
    {
        const size_t len = largest * sizeof (float) * 4;
        preshader->registers = (float *) m(len, d);
        memset(preshader->registers, '\0', len);
        preshader->register_count = largest;
    } // if

    return preshader;

parsePreshader_notAPreshader:
parsePreshader_outOfMemory:
    MOJOSHADER_freePreshader(preshader, f, d);
    return NULL;
} // MOJOSHADER_parsePreshader


void MOJOSHADER_freePreshader(const MOJOSHADER_preshader *preshader,
                              MOJOSHADER_free f,
                              void *d)
{
    int i, j;
    if (preshader != NULL)
    {
        f((void *) preshader->literals, d);
        for (i = 0; i < preshader->instruction_count; i++)
            for (j = 0; j < preshader->instructions[i].operand_count; j++)
                f((void *) preshader->instructions[i].operands[j].array_registers, d);
        f((void *) preshader->instructions, d);
        f((void *) preshader->registers, d);
        free_symbols(f, d, preshader->symbols, preshader->symbol_count);
        f((void *) preshader, d);
    } // if
} // MOJOSHADER_freePreshader

// end of mojoshader.c ...