Implement HLSL emitter, MOJOSHADER_d3d11 API
author Caleb Cornett <caleb.cornett@outlook.com>
Thu, 21 May 2020 12:00:55 -0400
changeset 1255 0135d797e287
parent 1254 422f68756c9f
child 1256 213b2cb6fd46
Implement HLSL emitter, MOJOSHADER_d3d11 API
CMakeLists.txt
mojoshader.c
mojoshader.h
mojoshader_d3d11.c
mojoshader_internal.h
profiles/mojoshader_profile.h
profiles/mojoshader_profile_glsl.c
profiles/mojoshader_profile_hlsl.c
--- a/CMakeLists.txt	Mon May 04 09:55:32 2020 -0400
+++ b/CMakeLists.txt	Thu May 21 12:00:55 2020 -0400
@@ -4,6 +4,7 @@
 OPTION(BUILD_SHARED_LIBS "Build MojoShader as a shared library" OFF)
 OPTION(PROFILE_D3D "Build MojoShader with support for the D3D profile" ON)
 OPTION(PROFILE_BYTECODE "Build MojoShader with support for the BYTECODE profile" ON)
+OPTION(PROFILE_HLSL "Build MojoShader with support for the HLSL profile" ON)
 OPTION(PROFILE_GLSL120 "Build MojoShader with support for the GLSL120 profile" ON)
 OPTION(PROFILE_GLSLES "Build MojoShader with support for the GLSLES profile" ON)
 OPTION(PROFILE_GLSL "Build MojoShader with support for the GLSL profile" ON)
@@ -108,6 +109,9 @@
 IF(NOT PROFILE_BYTECODE)
     ADD_DEFINITIONS(-DSUPPORT_PROFILE_BYTECODE=0)
 ENDIF(NOT PROFILE_BYTECODE)
+IF(NOT PROFILE_HLSL)
+    ADD_DEFINITIONS(-DSUPPORT_PROFILE_HLSL=0)
+ENDIF(NOT PROFILE_HLSL)
 IF(NOT PROFILE_GLSL120)
     ADD_DEFINITIONS(-DSUPPORT_PROFILE_GLSL120=0)
 ENDIF(NOT PROFILE_GLSL120)
@@ -159,9 +163,11 @@
     mojoshader_common.c
     mojoshader_opengl.c
     mojoshader_metal.c
+    mojoshader_d3d11.c
     profiles/mojoshader_profile_arb1.c
     profiles/mojoshader_profile_bytecode.c
     profiles/mojoshader_profile_d3d.c
+    profiles/mojoshader_profile_hlsl.c
     profiles/mojoshader_profile_glsl.c
     profiles/mojoshader_profile_metal.c
     profiles/mojoshader_profile_spirv.c
--- a/mojoshader.c	Mon May 04 09:55:32 2020 -0400
+++ b/mojoshader.c	Thu May 21 12:00:55 2020 -0400
@@ -236,6 +236,15 @@
 PREDECLARE_PROFILE(D3D)
 #endif
 
+#if !SUPPORT_PROFILE_HLSL
+#define PROFILE_EMITTER_HLSL(op)
+#else
+#undef AT_LEAST_ONE_PROFILE
+#define AT_LEAST_ONE_PROFILE 1
+#define PROFILE_EMITTER_HLSL(op) emit_HLSL_##op,
+PREDECLARE_PROFILE(HLSL)
+#endif
+
 #if !SUPPORT_PROFILE_GLSL
 #define PROFILE_EMITTER_GLSL(op)
 #else
@@ -300,6 +309,9 @@
 #if SUPPORT_PROFILE_BYTECODE
     DEFINE_PROFILE(BYTECODE)
 #endif
+#if SUPPORT_PROFILE_HLSL
+    DEFINE_PROFILE(HLSL)
+#endif
 #if SUPPORT_PROFILE_GLSL
     DEFINE_PROFILE(GLSL)
 #endif
@@ -331,6 +343,7 @@
 #define PROFILE_EMITTERS(op) { \
      PROFILE_EMITTER_D3D(op) \
      PROFILE_EMITTER_BYTECODE(op) \
+     PROFILE_EMITTER_HLSL(op) \
      PROFILE_EMITTER_GLSL(op) \
      PROFILE_EMITTER_ARB1(op) \
      PROFILE_EMITTER_METAL(op) \
@@ -3866,6 +3879,7 @@
     #define PROFILE_SHADER_MODEL(p,v) if (strcmp(profile, p) == 0) return v;
     PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_D3D, 3);
     PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_BYTECODE, 3);
+    PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_HLSL, 3);
     PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSL, 3);
     PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSL120, 3);
     PROFILE_SHADER_MODEL(MOJOSHADER_PROFILE_GLSLES, 3);
--- a/mojoshader.h	Mon May 04 09:55:32 2020 -0400
+++ b/mojoshader.h	Thu May 21 12:00:55 2020 -0400
@@ -686,6 +686,11 @@
 #define MOJOSHADER_PROFILE_BYTECODE "bytecode"
 
 /*
+ * Profile string for HLSL Shader Model 4 output.
+ */
+#define MOJOSHADER_PROFILE_HLSL "hlsl"
+
+/*
  * Profile string for GLSL: OpenGL high-level shader language output.
  */
 #define MOJOSHADER_PROFILE_GLSL "glsl"
@@ -3528,6 +3533,209 @@
 DECLSPEC void MOJOSHADER_mtlDestroyContext(void);
 
 
+/* D3D11 interface... */
+
+typedef struct MOJOSHADER_d3d11Shader MOJOSHADER_d3d11Shader;
+
+/*
+ * Prepare MojoShader to manage Direct3D 11 shaders.
+ *
+ * You do not need to call this if all you want is MOJOSHADER_parse().
+ *
+ * You must call this once AFTER you have successfully built your D3D11 context.
+ *
+ * As MojoShader requires some memory to be allocated, you may provide a
+ *  custom allocator to this function, which will be used to allocate/free
+ *  memory. They function just like malloc() and free(). We do not use
+ *  realloc(). If you don't care, pass NULL in for the allocator functions.
+ *  If your allocator needs instance-specific data, you may supply it with the
+ *  (malloc_d) parameter. This pointer is passed as-is to your (m) and (f)
+ *  functions.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC int MOJOSHADER_d3d11CreateContext(void *device, void *deviceContext,
+                                           MOJOSHADER_malloc m, MOJOSHADER_free f,
+                                           void *malloc_d);
+
+/*
+ * Get any error state we might have picked up, such as failed shader
+ *  compilation.
+ *
+ * Returns a human-readable string. This string is for debugging purposes, and
+ *  not guaranteed to be localized, coherent, or user-friendly in any way.
+ *  It's for programmers!
+ *
+ * The latest error may remain between calls. New errors replace any existing
+ *  error. Don't check this string for a sign that an error happened, check
+ *  return codes instead and use this for explanation when debugging.
+ *
+ * Do not free the returned string: it's a pointer to a static internal
+ *  buffer. Do not keep the pointer around, either, as it's likely to become
+ *  invalid as soon as you call into MojoShader again.
+ */
+DECLSPEC const char *MOJOSHADER_d3d11GetError(void);
+
+/*
+ * Compile a buffer of Direct3D 9 shader bytecode into a Direct3D 11 shader.
+ *  You still need to link the shader before you may render with it.
+ *
+ *   (mainfn) is the name of the shader's main function.
+ *   (tokenbuf) is a buffer of Direct3D shader bytecode.
+ *   (bufsize) is the size, in bytes, of the bytecode buffer.
+ *   (swiz), (swizcount), (smap), and (smapcount) are passed to
+ *   MOJOSHADER_parse() unmolested.
+ *
+ * Returns NULL on error, or a shader handle on success.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC MOJOSHADER_d3d11Shader *MOJOSHADER_d3d11CompileShader(const char *mainfn,
+                                                               const unsigned char *tokenbuf,
+                                                               const unsigned int bufsize,
+                                                               const MOJOSHADER_swizzle *swiz,
+                                                               const unsigned int swizcount,
+                                                               const MOJOSHADER_samplerMap *smap,
+                                                               const unsigned int smapcount);
+
+/*
+ * Increments a shader's internal refcount. To decrement the refcount, call
+ *  MOJOSHADER_d3d11DeleteShader().
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11ShaderAddRef(MOJOSHADER_d3d11Shader *shader);
+
+/*
+ * Get the MOJOSHADER_parseData structure that was produced from the
+ *  call to MOJOSHADER_d3d11CompileShader().
+ *
+ * This data is read-only, and you should NOT attempt to free it. This
+ *  pointer remains valid until the shader is deleted.
+ */
+DECLSPEC const MOJOSHADER_parseData *MOJOSHADER_d3d11GetShaderParseData(
+                                                MOJOSHADER_d3d11Shader *shader);
+
+/*
+ * This binds individual shaders together, to be linked into a single working
+ *  program once MOJOSHADER_d3d11ProgramReady is called.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11BindShaders(MOJOSHADER_d3d11Shader *vshader,
+                                          MOJOSHADER_d3d11Shader *pshader);
+
+/*
+ * This queries for the shaders currently bound to the active context.
+ *
+ * This function is only for convenience, specifically for compatibility with
+ * the effects API.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11GetBoundShaders(MOJOSHADER_d3d11Shader **vshader,
+                                              MOJOSHADER_d3d11Shader **pshader);
+
+/*
+ * Fills register pointers with pointers that are directly used to push uniform
+ *  data to the D3D11 shader context.
+ *
+ * This function is really just for the effects API, you should NOT be using
+ *  this unless you know every single line of MojoShader from memory.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11MapUniformBufferMemory(float **vsf, int **vsi, unsigned char **vsb,
+                                                     float **psf, int **psi, unsigned char **psb);
+
+/*
+ * Tells the context that you are done with the memory mapped by
+ *  MOJOSHADER_d3d11MapUniformBufferMemory().
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11UnmapUniformBufferMemory(void);
+
+/*
+ * Return the location of a vertex attribute for the given vertex shader.
+ *
+ * (usage) and (index) map to Direct3D vertex declaration values: COLOR1 would
+ *  be MOJOSHADER_USAGE_COLOR and 1.
+ *
+ * The return value is the index of the attribute to be used when building the
+ *  input layout object.
+ */
+DECLSPEC int MOJOSHADER_d3d11GetVertexAttribLocation(MOJOSHADER_d3d11Shader *vert,
+                                                     MOJOSHADER_usage usage,
+                                                     int index);
+
+/*
+ * Using the given input layout, compiles the vertex shader with input
+ *  parameters that will be compatible with the incoming vertex data.
+ *
+ * (inputLayoutHash) is an application-defined value to differentiate unique
+ *  vertex declarations that will be passed to the vertex shader.
+ *  (elements) is an array of D3D11_INPUT_ELEMENT_DESCs, with (elementCount)
+ *  entries. (bytecode) and (bytecodeLength) will be filled with the final
+ *  compiled D3D11 vertex shader.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11CompileVertexShader(unsigned long long inputLayoutHash,
+                                                  void *elements, int elementCount,
+                                                  void **bytecode, int *bytecodeLength);
+
+/*
+ * Inform MojoShader that it should commit any pending state and prepare the
+ *  final shader program object, linking the input/output parameter data to
+ *  be compatible with the more-strict Shader Model 4 rule set. This must be
+ *  called after you bind shaders and update any inputs, right before you start
+ *  drawing, so any outstanding changes made to the shared constants array (etc)
+ *  can propagate to the shader during this call.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11ProgramReady(unsigned long long inputLayoutHash);
+
+/*
+ * Free the resources of a compiled shader. This will delete the shader object
+ *  and free memory.
+ *
+ * This call is only as thread safe as your D3D11 context! If you call your
+ *  context from multiple threads, you must protect this call with whatever
+ *  thread synchronization technique you have for your other D3D calls.
+ */
+DECLSPEC void MOJOSHADER_d3d11DeleteShader(MOJOSHADER_d3d11Shader *shader);
+
+/*
+ * Deinitialize MojoShader's D3D11 shader management.
+ *
+ * This will clean up resources previously allocated for the active context.
+ *
+ * This will NOT clean up shaders you created! Please destroy all shaders
+ *  before calling this function.
+ */
+DECLSPEC void MOJOSHADER_d3d11DestroyContext(void);
+
+
 /* Effects interface... */
 #include "mojoshader_effects.h"
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mojoshader_d3d11.c	Thu May 21 12:00:55 2020 -0400
@@ -0,0 +1,912 @@
+/**
+ * MojoShader; generate shader programs from bytecode of compiled
+ *  Direct3D shaders.
+ *
+ * Please see the file LICENSE.txt in the source's root directory.
+ *
+ *  This file written by Ryan C. Gordon.
+ */
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN 1
+#include <windows.h> // Include this early to avoid SDL conflicts
+#endif
+
+#define __MOJOSHADER_INTERNAL__ 1
+#include "mojoshader_internal.h"
+
+#if SUPPORT_PROFILE_HLSL
+
+#define D3D11_NO_HELPERS
+#define CINTERFACE
+#define COBJMACROS
+#include <d3d11.h>
+
+#ifndef WINAPI_FAMILY_WINRT
+#define WINAPI_FAMILY_WINRT 0
+#endif
+#if WINAPI_FAMILY_WINRT
+#include <d3dcompiler.h>
+#endif
+
+/* Error state */
+
+static char error_buffer[1024] = { '\0' };
+
+static void set_error(const char *str)
+{
+    snprintf(error_buffer, sizeof (error_buffer), "%s", str);
+} // set_error
+
+static inline void out_of_memory(void)
+{
+    set_error("out of memory");
+} // out_of_memory
+
+/* D3DCompile signature */
+
+typedef HRESULT(WINAPI *PFN_D3DCOMPILE)(
+    LPCVOID pSrcData,
+    SIZE_T SrcDataSize,
+    LPCSTR pSourceName,
+    const D3D_SHADER_MACRO *pDefines,
+    ID3DInclude *pInclude,
+    LPCSTR pEntrypoint,
+    LPCSTR pTarget,
+    UINT Flags1,
+    UINT Flags2,
+    ID3DBlob **ppCode,
+    ID3DBlob **ppErrorMsgs
+);
+
+/* Structs */
+
+typedef struct d3d11ShaderMap
+{
+    void *val;
+    union
+    {
+        struct
+        {
+            uint64 layoutHash;
+            ID3D10Blob *blob;
+        } vertex;
+        struct
+        {
+            MOJOSHADER_d3d11Shader *vshader;
+        } pixel;
+    };
+} d3d11ShaderMap;
+
+typedef struct MOJOSHADER_d3d11Shader
+{
+    const MOJOSHADER_parseData *parseData;
+    uint32 refcount;
+    ID3D11Buffer *ubo;
+    size_t buflen;
+    unsigned char *constantData;
+    unsigned int mapCapacity;
+    unsigned int numMaps;
+    d3d11ShaderMap *shaderMaps;
+} MOJOSHADER_d3d11Shader;
+
+// Max entries for each register file type...
+#define MAX_REG_FILE_F 8192
+#define MAX_REG_FILE_I 2047
+#define MAX_REG_FILE_B 2047
+
+typedef struct MOJOSHADER_d3d11Context
+{
+    // Allocators...
+    MOJOSHADER_malloc malloc_fn;
+    MOJOSHADER_free free_fn;
+    void *malloc_data;
+
+    // The constant register files...
+    // !!! FIXME: Man, it kills me how much memory this takes...
+    // !!! FIXME:  ... make this dynamically allocated on demand.
+    float vs_reg_file_f[MAX_REG_FILE_F * 4];
+    int vs_reg_file_i[MAX_REG_FILE_I * 4];
+    uint8 vs_reg_file_b[MAX_REG_FILE_B];
+    float ps_reg_file_f[MAX_REG_FILE_F * 4];
+    int ps_reg_file_i[MAX_REG_FILE_I * 4];
+    uint8 ps_reg_file_b[MAX_REG_FILE_B];
+
+    // Pointer to the active ID3D11Device.
+    ID3D11Device *device;
+
+    // Pointer to the ID3D11DeviceContext.
+    ID3D11DeviceContext *deviceContext;
+
+    // Currently bound vertex and pixel shaders.
+    MOJOSHADER_d3d11Shader *vertexShader;
+    MOJOSHADER_d3d11Shader *pixelShader;
+    int vertexNeedsBound;
+    int pixelNeedsBound;
+
+    // D3DCompile function pointer.
+    PFN_D3DCOMPILE D3DCompileFunc;
+#if !WINAPI_FAMILY_WINRT
+    HMODULE d3dcompilerDLL;
+#endif
+} MOJOSHADER_d3d11Context;
+
+static MOJOSHADER_d3d11Context *ctx = NULL;
+
+/* Uniform buffer utilities */
+
+static inline int next_highest_alignment(int n)
+{
+    const int align = 16;
+    return align * ((n + align - 1) / align);
+} // next_highest_alignment
+
+static inline void *get_uniform_buffer(MOJOSHADER_d3d11Shader *shader)
+{
+    return (shader == NULL || shader->ubo == NULL) ? NULL : shader->ubo;
+} // get_uniform_buffer
+
+static void update_uniform_buffer(MOJOSHADER_d3d11Shader *shader)
+{
+    if (shader == NULL || shader->ubo == NULL)
+        return;
+
+    float *regF; int *regI; uint8 *regB;
+    if (shader->parseData->shader_type == MOJOSHADER_TYPE_VERTEX)
+    {
+        regF = ctx->vs_reg_file_f;
+        regI = ctx->vs_reg_file_i;
+        regB = ctx->vs_reg_file_b;
+    } // if
+    else
+    {
+        regF = ctx->ps_reg_file_f;
+        regI = ctx->ps_reg_file_i;
+        regB = ctx->ps_reg_file_b;
+    } // else
+
+    // Update the buffer contents
+    int needsUpdate = 0;
+    size_t offset = 0;
+    for (int i = 0; i < shader->parseData->uniform_count; i++)
+    {
+        if (shader->parseData->uniforms[i].constant)
+            continue;
+
+        int idx = shader->parseData->uniforms[i].index;
+        int arrayCount = shader->parseData->uniforms[i].array_count;
+
+        void *src = NULL;
+        void *dst = NULL;
+        size_t size = arrayCount ? arrayCount : 1;
+
+        switch (shader->parseData->uniforms[i].type)
+        {
+            case MOJOSHADER_UNIFORM_FLOAT:
+                src = &regF[4 * idx];
+                dst = shader->constantData + offset;
+                size *= 16;
+                break;
+
+            case MOJOSHADER_UNIFORM_INT:
+                src = &regI[4 * idx];
+                dst = shader->constantData + offset;
+                size *= 16;
+                break;
+
+            case MOJOSHADER_UNIFORM_BOOL:
+                src = &regB[idx];
+                dst = shader->constantData + offset;
+                break;
+
+            default:
+                assert(0); // This should never happen.
+                break;
+        } // switch
+
+        if (memcmp(dst, src, size) != 0)
+        {
+            memcpy(dst, src, size);
+            needsUpdate = 1;
+        } // if
+
+        offset += size;
+    } // for
+
+    if (needsUpdate)
+    {
+        // Map the buffer
+        D3D11_MAPPED_SUBRESOURCE res;
+        ID3D11DeviceContext_Map((ID3D11DeviceContext*) ctx->deviceContext,
+                                (ID3D11Resource*) shader->ubo, 0,
+                                D3D11_MAP_WRITE_DISCARD, 0, &res);
+
+        // Copy the contents
+        memcpy(res.pData, shader->constantData, shader->buflen);
+
+        // Unmap the buffer
+        ID3D11DeviceContext_Unmap(
+            (ID3D11DeviceContext*) ctx->deviceContext,
+            (ID3D11Resource*) shader->ubo,
+            0
+        );
+    } // if
+} // update_uniform_buffer
+
+static inline void expand_map(MOJOSHADER_d3d11Shader *shader)
+{
+    if (shader->numMaps == shader->mapCapacity)
+    {
+        d3d11ShaderMap *newMap = (d3d11ShaderMap *) ctx->malloc_fn(
+            sizeof(d3d11ShaderMap) * shader->mapCapacity * 2,
+            ctx->malloc_data
+        );
+        memcpy(newMap, shader->shaderMaps,
+            sizeof(d3d11ShaderMap) * shader->mapCapacity);
+        shader->mapCapacity *= 2;
+        ctx->free_fn(shader->shaderMaps, ctx->malloc_data);
+        shader->shaderMaps = newMap;
+        newMap = NULL;
+    } // if
+} // expand_map
+
+static inline int element_is_uint(DXGI_FORMAT format)
+{
+    return  format == DXGI_FORMAT_R32G32B32A32_UINT ||
+            format == DXGI_FORMAT_R32G32B32_UINT ||
+            format == DXGI_FORMAT_R16G16B16A16_UINT ||
+            format == DXGI_FORMAT_R32G32_UINT ||
+            format == DXGI_FORMAT_R10G10B10A2_UINT ||
+            format == DXGI_FORMAT_R8G8B8A8_UINT ||
+            format == DXGI_FORMAT_R16G16_UINT ||
+            format == DXGI_FORMAT_R32_UINT ||
+            format == DXGI_FORMAT_R8G8_UINT ||
+            format == DXGI_FORMAT_R16_UINT ||
+            format == DXGI_FORMAT_R8_UINT;
+} // element_is_uint
+
+static inline int element_is_int(DXGI_FORMAT format)
+{
+    return  format == DXGI_FORMAT_R32G32B32A32_SINT ||
+            format == DXGI_FORMAT_R32G32B32_SINT ||
+            format == DXGI_FORMAT_R16G16B16A16_SINT ||
+            format == DXGI_FORMAT_R32G32_SINT ||
+            format == DXGI_FORMAT_R8G8B8A8_SINT ||
+            format == DXGI_FORMAT_R16G16_SINT ||
+            format == DXGI_FORMAT_R32_SINT ||
+            format == DXGI_FORMAT_R8G8_SINT ||
+            format == DXGI_FORMAT_R16_SINT ||
+            format == DXGI_FORMAT_R8_SINT;
+} // element_is_int
+
+/* Shader Compilation Utilities */
+
+static ID3D11VertexShader *compileVertexShader(MOJOSHADER_d3d11Shader *shader,
+                                               const char *src, int src_len,
+                                               ID3D10Blob **blob)
+{
+    // Bytecode lands in (*blob); diagnostics get their own blob so the two
+    //  D3DCompile out-params never alias. (*blob) is reset to NULL up front.
+    const MOJOSHADER_parseData *pd = shader->parseData;
+    ID3D11VertexShader *ret = NULL;
+    ID3D10Blob *errors = NULL;
+
+    *blob = NULL;
+    if (ctx->D3DCompileFunc(src, src_len, pd->mainfn, NULL, NULL, pd->mainfn,
+                            "vs_4_0", 0, 0, blob, &errors) < 0)
+        set_error((errors == NULL) ? "D3DCompile failed" :
+                  (const char *) ID3D10Blob_GetBufferPointer(errors));
+    else
+        ID3D11Device_CreateVertexShader(ctx->device,
+                                        ID3D10Blob_GetBufferPointer(*blob),
+                                        ID3D10Blob_GetBufferSize(*blob),
+                                        NULL, &ret);
+    if (errors != NULL) ID3D10Blob_Release(errors);
+    return ret;
+} // compileVertexShader
+
+static void replaceVarname(const char *find, const char *replace,
+                           const char **source)
+{
+    const char *srcbuf = *source;
+    size_t find_len = strlen(find);
+    size_t replace_len = strlen(replace);
+
+    #define IS_PARTIAL_TOKEN(token) \
+        (isalnum(*(token + find_len)) || isalnum(*(token-1)))
+
+    // How many times does `find` occur in the source buffer?
+    int count = 0;
+    char *ptr = (char *) strstr(srcbuf, find);
+    while (ptr != NULL)
+    {
+        if (!IS_PARTIAL_TOKEN(ptr))
+            count++;
+        ptr = strstr(ptr + find_len, find);
+    } // while
+
+    // How big should we make the new text buffer?
+    size_t oldlen = strlen(srcbuf) + 1;
+    size_t newlen = oldlen + (count * (replace_len - find_len));
+
+    // Easy case; just find/replace in the original buffer
+    if (newlen == oldlen)
+    {
+        ptr = (char *) strstr(srcbuf, find);
+        while (ptr != NULL)
+        {
+            if (!IS_PARTIAL_TOKEN(ptr))
+                memcpy(ptr, replace, replace_len);
+            ptr = strstr(ptr + find_len, find);
+        } // while
+        return;
+    } // if
+
+    // Allocate a new buffer
+    char *newbuf = (char *) ctx->malloc_fn(newlen, ctx->malloc_data);
+    memset(newbuf, '\0', newlen);
+
+    // Find + replace
+    char *prev_ptr = (char *) srcbuf;
+    char *curr_ptr = (char *) newbuf;
+    ptr = (char*) strstr(srcbuf, find);
+    while (ptr != NULL)
+    {
+        memcpy(curr_ptr, prev_ptr, ptr - prev_ptr);
+        curr_ptr += ptr - prev_ptr;
+
+        if (!IS_PARTIAL_TOKEN(ptr))
+        {
+            memcpy(curr_ptr, replace, replace_len);
+            curr_ptr += replace_len;
+        } // if
+        else
+        {
+            // Don't accidentally eat partial tokens...
+            memcpy(curr_ptr, find, find_len);
+            curr_ptr += find_len;
+        } // else
+
+        prev_ptr = ptr + find_len;
+        ptr = strstr(prev_ptr, find);
+    } // while
+
+    #undef IS_PARTIAL_TOKEN
+
+    // Copy the remaining part of the source buffer
+    memcpy(curr_ptr, prev_ptr, (srcbuf + oldlen) - prev_ptr);
+
+    // Free the source buffer
+    ctx->free_fn((void *) srcbuf, ctx->malloc_data);
+
+    // Point the source parameter to the new buffer
+    *source = newbuf;
+} // replaceVarname
+
+static char *rewritePixelShader(MOJOSHADER_d3d11Shader *vshader,
+                                MOJOSHADER_d3d11Shader *pshader)
+{
+    const MOJOSHADER_parseData *vpd = vshader->parseData;
+    const MOJOSHADER_parseData *ppd = pshader->parseData;
+    const char *_Output = "_Output" ENDLINE_STR "{" ENDLINE_STR;
+    const char *_Input = "_Input" ENDLINE_STR "{" ENDLINE_STR;
+    const char *vsrc = vpd->output;
+    const char *psrc = ppd->output;
+    const char *a, *b, *vout, *pstart, *vface, *pend;
+    size_t substr_len;
+    char *pfinal;
+
+    #define MAKE_STRBUF(buf) \
+        substr_len = b - a; \
+        buf = (const char *) ctx->malloc_fn(substr_len + 1, ctx->malloc_data); \
+        memset((void *) buf, '\0', substr_len + 1); \
+        memcpy((void *) buf, a, substr_len);
+
+    // Copy the vertex function's output struct into a buffer
+    a = strstr(vsrc, _Output) + strlen(_Output);
+    b = a;
+    while (*(b++) != '}');
+    b--;
+    MAKE_STRBUF(vout)
+
+    // Split up the pixel shader text...
+
+    // ...everything up to the input contents...
+    a = psrc;
+    b = strstr(psrc, _Input) + strlen(_Input);
+    MAKE_STRBUF(pstart)
+
+    // ...everything after the input contents.
+    a = b;
+    while (*(a++) != '}');
+    a--;
+    while (*(b++) != '\0');
+    MAKE_STRBUF(pend)
+
+    // Find matching semantics
+    int i, j;
+    int vfaceidx = -1;
+    const char *pvarname, *vvarname;
+    for (i = 0; i < ppd->attribute_count; i++)
+    {
+        for (j = 0; j < vpd->output_count; j++)
+        {
+            if (ppd->attributes[i].usage == vpd->outputs[j].usage &&
+                ppd->attributes[i].index == vpd->outputs[j].index)
+            {
+                pvarname = ppd->attributes[i].name;
+                vvarname = vpd->outputs[j].name;
+                if (strcmp(pvarname, vvarname) != 0)
+                    replaceVarname(pvarname, vvarname, &pend);
+            } // if
+            else if (strcmp(ppd->attributes[i].name, "vPos") == 0 &&
+                     vpd->outputs[j].usage == MOJOSHADER_USAGE_POSITION &&
+                     vpd->outputs[j].index == 0)
+            {
+                pvarname = ppd->attributes[i].name;
+                vvarname = vpd->outputs[j].name;
+                if (strcmp(pvarname, vvarname) != 0)
+                    replaceVarname(pvarname, vvarname, &pend);
+            } // else if
+        } // for
+
+        if (strcmp(ppd->attributes[i].name, "vFace") == 0)
+            vfaceidx = i;
+    } // for
+
+    // Special handling for VFACE
+    vface = (vfaceidx != -1) ? "\tbool m_vFace : SV_IsFrontFace;\n" : "";
+
+    // Concatenate the shader pieces together
+    substr_len = strlen(pstart) + strlen(vout) + strlen(vface) + strlen(pend);
+    pfinal = (char *) ctx->malloc_fn(substr_len + 1, ctx->malloc_data);
+    memset((void *) pfinal, '\0', substr_len + 1);
+    memcpy(pfinal, pstart, strlen(pstart));
+    memcpy(pfinal + strlen(pstart), vout, strlen(vout));
+    memcpy(pfinal + strlen(pstart) + strlen(vout), vface, strlen(vface));
+    memcpy(pfinal + strlen(pstart) + strlen(vout) + strlen(vface), pend, strlen(pend));
+
+    // Free the temporary buffers
+    ctx->free_fn((void *) vout, ctx->malloc_data);
+    ctx->free_fn((void *) pstart, ctx->malloc_data);
+    ctx->free_fn((void *) pend, ctx->malloc_data);
+
+    #undef MAKE_STRBUF
+
+    return pfinal;
+} // rewritePixelShader
+
+static ID3D11PixelShader *compilePixelShader(MOJOSHADER_d3d11Shader *vshader,
+                                             MOJOSHADER_d3d11Shader *pshader)
+{
+    // Compiles (pshader) as ps_4_0, first rewriting its input struct against
+    //  (vshader) when it has attributes. Returns NULL (and sets the error
+    //  string) if D3DCompile fails.
+    ID3D11PixelShader *retval = NULL;
+    ID3DBlob *blob = NULL;
+    ID3DBlob *errors = NULL; // kept apart from (blob): both are out-params.
+    const char *source = pshader->parseData->output;
+    int needs_free = 0; // parseData owns (output); only a rewrite is ours.
+    HRESULT result;
+
+    if (pshader->parseData->attribute_count > 0)
+    {
+        source = rewritePixelShader(vshader, pshader);
+        needs_free = 1;
+    } // if
+
+    result = ctx->D3DCompileFunc(source, strlen(source),
+                                 pshader->parseData->mainfn, NULL, NULL,
+                                 pshader->parseData->mainfn, "ps_4_0", 0, 0,
+                                 &blob, &errors);
+
+    if (result < 0)
+        set_error((errors == NULL) ? "D3DCompile failed" :
+                  (const char *) ID3D10Blob_GetBufferPointer(errors));
+    else
+        ID3D11Device_CreatePixelShader(ctx->device,
+                                       ID3D10Blob_GetBufferPointer(blob),
+                                       ID3D10Blob_GetBufferSize(blob),
+                                       NULL, &retval);
+
+    // Shared cleanup for the success and failure paths.
+    if (errors != NULL)
+        ID3D10Blob_Release(errors);
+    if (blob != NULL)
+        ID3D10Blob_Release(blob);
+    if (needs_free)
+        ctx->free_fn((void *) source, ctx->malloc_data);
+    return retval;
+} // compilePixelShader
+
+/* Public API */
+
+int MOJOSHADER_d3d11CreateContext(void *device, void *deviceContext,
+                                  MOJOSHADER_malloc m, MOJOSHADER_free f,
+                                  void *malloc_d)
+{
+    assert(ctx == NULL);
+
+    if (m == NULL) m = MOJOSHADER_internal_malloc;
+    if (f == NULL) f = MOJOSHADER_internal_free;
+
+    ctx = (MOJOSHADER_d3d11Context *) m(sizeof(MOJOSHADER_d3d11Context), malloc_d);
+    if (ctx == NULL)
+    {
+        out_of_memory();
+        return -1;
+    } // if
+
+    memset(ctx, '\0', sizeof (MOJOSHADER_d3d11Context));
+    ctx->malloc_fn = m;
+    ctx->free_fn = f;
+    ctx->malloc_data = malloc_d;
+    ctx->device = (ID3D11Device*) device;
+    ctx->deviceContext = (ID3D11DeviceContext*) deviceContext;
+
+    // Grab D3DCompile; fail cleanly (even with NDEBUG) if it's unavailable.
+#if WINAPI_FAMILY_WINRT
+    ctx->D3DCompileFunc = D3DCompile;
+#else
+    ctx->d3dcompilerDLL = LoadLibrary("d3dcompiler_47.dll");
+    if (ctx->d3dcompilerDLL != NULL)
+        ctx->D3DCompileFunc = (PFN_D3DCOMPILE) GetProcAddress(ctx->d3dcompilerDLL,
+                                                              "D3DCompile");
+    if (ctx->D3DCompileFunc == NULL)
+    {
+        set_error("could not load D3DCompile from d3dcompiler_47.dll");
+        if (ctx->d3dcompilerDLL != NULL) FreeLibrary(ctx->d3dcompilerDLL);
+        f(ctx, malloc_d);
+        ctx = NULL; // don't leave a dangling global behind.
+        return -1;
+    } // if
+#endif /* WINAPI_FAMILY_WINRT */
+    return 0;
+} // MOJOSHADER_d3d11CreateContext
+
+void MOJOSHADER_d3d11DestroyContext(void)
+{
+#if !WINAPI_FAMILY_WINRT
+    FreeLibrary(ctx->d3dcompilerDLL);
+#endif
+    ctx->free_fn(ctx, ctx->malloc_data);
+    ctx = NULL;
+} // MOJOSHADER_d3d11DestroyContext
+
+/* Parses D3D bytecode with the "hlsl" profile and wraps the result in a
+ * MOJOSHADER_d3d11Shader with a refcount of 1.
+ *
+ * mainfn, tokenbuf, bufsize, swiz, swizcount, smap and smapcount are passed
+ * straight through to MOJOSHADER_parse().
+ *
+ * If the shader declares uniforms, a dynamic D3D11 constant buffer and a
+ * zeroed CPU-side staging buffer of the same logical size are created.
+ *
+ * Returns the new shader, or NULL on failure. On failure the error buffer
+ * is set and every partially-built resource (parse data, shader struct,
+ * map array, staging buffer, constant buffer) is released.
+ */
+MOJOSHADER_d3d11Shader *MOJOSHADER_d3d11CompileShader(const char *mainfn,
+                                                      const unsigned char *tokenbuf,
+                                                      const unsigned int bufsize,
+                                                      const MOJOSHADER_swizzle *swiz,
+                                                      const unsigned int swizcount,
+                                                      const MOJOSHADER_samplerMap *smap,
+                                                      const unsigned int smapcount)
+{
+    MOJOSHADER_malloc m = ctx->malloc_fn;
+    MOJOSHADER_free f = ctx->free_fn;
+    void *d = ctx->malloc_data;
+    MOJOSHADER_d3d11Shader *retval = NULL;
+    int i;
+
+    const MOJOSHADER_parseData *pd = MOJOSHADER_parse("hlsl", mainfn, tokenbuf,
+                                                     bufsize, swiz, swizcount,
+                                                     smap, smapcount, m, f, d);
+
+    if (pd->error_count > 0)
+    {
+        // !!! FIXME: put multiple errors in the buffer? Don't use
+        // !!! FIXME:  MOJOSHADER_d3d11GetError() for this?
+        set_error(pd->errors[0].error);
+        goto compile_shader_fail;
+    } // if
+
+    retval = (MOJOSHADER_d3d11Shader *) m(sizeof(MOJOSHADER_d3d11Shader), d);
+    if (retval == NULL)
+    {
+        set_error("out of memory");
+        goto compile_shader_fail;
+    } // if
+
+    // Initialize every owned pointer up front so the failure path can tell
+    // which resources actually exist.
+    retval->parseData = pd;
+    retval->refcount = 1;
+    retval->ubo = NULL;
+    retval->constantData = NULL;
+    retval->buflen = 0;
+    retval->numMaps = 0;
+
+    // Allocate shader maps
+    retval->mapCapacity = 4; // arbitrary!
+    retval->shaderMaps = (d3d11ShaderMap *) m(retval->mapCapacity * sizeof(d3d11ShaderMap), d);
+    if (retval->shaderMaps == NULL)
+    {
+        set_error("out of memory");
+        goto compile_shader_fail;
+    } // if
+
+    memset(retval->shaderMaps, '\0', retval->mapCapacity * sizeof(d3d11ShaderMap));
+
+    // Create the uniform buffer, if needed
+    if (pd->uniform_count > 0)
+    {
+        // Calculate how big we need to make the buffer
+        for (i = 0; i < pd->uniform_count; i++)
+        {
+            int arrayCount = pd->uniforms[i].array_count;
+            int uniformSize = 16;
+            if (pd->uniforms[i].type == MOJOSHADER_UNIFORM_BOOL)
+                uniformSize = 1;
+            retval->buflen += (arrayCount ? arrayCount : 1) * uniformSize;
+        } // for
+
+        D3D11_BUFFER_DESC bdesc;
+        bdesc.ByteWidth = next_highest_alignment(retval->buflen);
+        bdesc.Usage = D3D11_USAGE_DYNAMIC;
+        bdesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+        bdesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+        bdesc.MiscFlags = 0;
+        bdesc.StructureByteStride = 0;
+        ID3D11Device_CreateBuffer((ID3D11Device*) ctx->device, &bdesc, NULL,
+                                  (ID3D11Buffer**) &retval->ubo);
+
+        // ubo was NULL before the call; still NULL means creation failed.
+        if (retval->ubo == NULL)
+        {
+            set_error("failed to create uniform buffer");
+            goto compile_shader_fail;
+        } // if
+
+        // Additionally allocate a CPU-side staging buffer
+        retval->constantData = (unsigned char *) m(retval->buflen, d);
+        if (retval->constantData == NULL)
+        {
+            set_error("out of memory");
+            goto compile_shader_fail;
+        } // if
+        memset(retval->constantData, '\0', retval->buflen);
+    } // if
+
+    return retval;
+
+compile_shader_fail:
+    if (retval != NULL)
+    {
+        if (retval->ubo != NULL)
+            ID3D11Buffer_Release((ID3D11Buffer*) retval->ubo);
+        if (retval->constantData != NULL)
+            f(retval->constantData, d);
+        if (retval->shaderMaps != NULL)
+            f(retval->shaderMaps, d);
+        f(retval, d);
+    } // if
+    MOJOSHADER_freeParseData(pd);
+    return NULL;
+} // MOJOSHADER_d3d11CompileShader
+
+/* Adds one reference to `shader`. A NULL shader is ignored. */
+void MOJOSHADER_d3d11ShaderAddRef(MOJOSHADER_d3d11Shader *shader)
+{
+    if (shader == NULL)
+        return;
+
+    shader->refcount += 1;
+} // MOJOSHADER_d3d11ShaderAddRef
+
+/* Drops one reference to `shader`; a NULL shader is ignored.
+ *
+ * While other references remain, only the refcount is decremented. When the
+ * last reference goes away this releases the constant buffer, the CPU-side
+ * staging buffer, every compiled D3D11 shader object in the map array (plus
+ * the bytecode blobs kept for vertex shaders), the map array itself, the
+ * parse data, and finally the shader struct.
+ */
+void MOJOSHADER_d3d11DeleteShader(MOJOSHADER_d3d11Shader *shader)
+{
+    if (shader != NULL)
+    {
+        if (shader->refcount > 1)
+            shader->refcount--;
+        else
+        {
+            if (shader->ubo != NULL)
+                ID3D11Buffer_Release((ID3D11Buffer*) shader->ubo);
+
+            // The staging buffer is freed independently of the GPU buffer,
+            // so it does not leak when buffer creation failed and left
+            // ubo == NULL.
+            if (shader->constantData != NULL)
+                ctx->free_fn(shader->constantData, ctx->malloc_data);
+
+            if (shader->parseData->shader_type == MOJOSHADER_TYPE_VERTEX)
+            {
+                for (int i = 0; i < shader->numMaps; i++)
+                {
+                    ID3D11VertexShader_Release((ID3D11VertexShader *) shader->shaderMaps[i].val);
+                    ID3D10Blob_Release(shader->shaderMaps[i].vertex.blob);
+                } // for
+            } // if
+            else if (shader->parseData->shader_type == MOJOSHADER_TYPE_PIXEL)
+            {
+                for (int i = 0; i < shader->numMaps; i++)
+                    ID3D11PixelShader_Release((ID3D11PixelShader *) shader->shaderMaps[i].val);
+            } // else if
+
+            ctx->free_fn(shader->shaderMaps, ctx->malloc_data);
+            shader->shaderMaps = NULL;
+            MOJOSHADER_freeParseData(shader->parseData);
+            ctx->free_fn(shader, ctx->malloc_data);
+        } // else
+    } // if
+} // MOJOSHADER_d3d11DeleteShader
+
+/* Returns the parse data stored in `shader`, or NULL when `shader` is NULL. */
+const MOJOSHADER_parseData *MOJOSHADER_d3d11GetShaderParseData(
+                                                MOJOSHADER_d3d11Shader *shader)
+{
+    if (shader == NULL)
+        return NULL;
+
+    return shader->parseData;
+} // MOJOSHADER_d3d11GetShaderParseData
+
+/* Records which vertex and pixel shaders should be bound next.
+ *
+ * A NULL argument leaves that stage's previously recorded shader (and its
+ * needs-bound flag) untouched. This only updates context state; the flags
+ * set here are consumed by MOJOSHADER_d3d11ProgramReady().
+ */
+void MOJOSHADER_d3d11BindShaders(MOJOSHADER_d3d11Shader *vshader,
+                                 MOJOSHADER_d3d11Shader *pshader)
+{
+    if (vshader)
+    {
+        ctx->vertexNeedsBound = 1;
+        ctx->vertexShader = vshader;
+    } // if
+
+    if (pshader)
+    {
+        ctx->pixelNeedsBound = 1;
+        ctx->pixelShader = pshader;
+    } // if
+} // MOJOSHADER_d3d11BindShaders
+
+/* Writes the context's currently recorded vertex and pixel shaders to
+ * *vshader and *pshader. Both output pointers are dereferenced
+ * unconditionally.
+ */
+void MOJOSHADER_d3d11GetBoundShaders(MOJOSHADER_d3d11Shader **vshader,
+                                     MOJOSHADER_d3d11Shader **pshader)
+{
+    MOJOSHADER_d3d11Shader *boundVS = ctx->vertexShader;
+    *vshader = boundVS;
+
+    MOJOSHADER_d3d11Shader *boundPS = ctx->pixelShader;
+    *pshader = boundPS;
+} // MOJOSHADER_d3d11GetBoundShaders
+
+/* Hands out pointers to the context's register files.
+ *
+ * vsf/vsi/vsb receive the vertex-stage float, int and bool register files;
+ * psf/psi/psb receive the pixel-stage equivalents. All six output pointers
+ * are dereferenced unconditionally.
+ */
+void MOJOSHADER_d3d11MapUniformBufferMemory(float **vsf, int **vsi, unsigned char **vsb,
+                                            float **psf, int **psi, unsigned char **psb)
+{
+    // Vertex stage
+    *vsf = ctx->vs_reg_file_f;
+    *vsi = ctx->vs_reg_file_i;
+    *vsb = ctx->vs_reg_file_b;
+
+    // Pixel stage
+    *psf = ctx->ps_reg_file_f;
+    *psi = ctx->ps_reg_file_i;
+    *psb = ctx->ps_reg_file_b;
+} // MOJOSHADER_d3d11MapUniformBufferMemory
+
+/* Pushes the latest parameter contents into the uniform buffers of the
+ * currently bound vertex and pixel shaders.
+ *
+ * Takes no arguments; declared with an explicit (void) parameter list so
+ * the definition is a real prototype, matching the other zero-argument
+ * entry points in this file.
+ */
+void MOJOSHADER_d3d11UnmapUniformBufferMemory(void)
+{
+    /* This has nothing to do with unmapping memory
+     * and everything to do with updating uniform
+     * buffers with the latest parameter contents.
+     */
+    MOJOSHADER_d3d11Shader *vs, *ps;
+    MOJOSHADER_d3d11GetBoundShaders(&vs, &ps);
+    update_uniform_buffer(vs);
+    update_uniform_buffer(ps);
+} // MOJOSHADER_d3d11UnmapUniformBufferMemory
+
+/* Looks up a vertex attribute by (usage, index) in the shader's parse data.
+ *
+ * Returns the position of the first matching entry in the attribute table,
+ * or -1 when `vert` is NULL or no entry matches.
+ */
+int MOJOSHADER_d3d11GetVertexAttribLocation(MOJOSHADER_d3d11Shader *vert,
+                                            MOJOSHADER_usage usage, int index)
+{
+    if (vert != NULL)
+    {
+        const MOJOSHADER_parseData *pd = vert->parseData;
+        int slot;
+
+        for (slot = 0; slot < pd->attribute_count; slot++)
+        {
+            if (pd->attributes[slot].usage != usage)
+                continue;
+            if (pd->attributes[slot].index != index)
+                continue;
+            return slot;
+        } // for
+    } // if
+
+    // no shader, or the requested attribute isn't declared.
+    return -1;
+} // MOJOSHADER_d3d11GetVertexAttribLocation
+
+/* Compiles (or fetches from cache) the currently bound vertex shader for a
+ * specific input layout and returns its bytecode.
+ *
+ * inputLayoutHash: key identifying the input layout; one compiled variant
+ *                  is cached per distinct hash.
+ * elements:        treated as an array of D3D11_INPUT_ELEMENT_DESC.
+ * elementCount:    number of entries in `elements`.
+ * bytecode:        receives a pointer to the compiled blob's data.
+ * bytecodeLength:  receives the blob size in bytes.
+ *
+ * For elements whose format is an integer type, the matching "float4"
+ * input declaration in the generated HLSL is rewritten in a private copy
+ * of the source to " uint4" or "  int4" (same width, so no text shifts).
+ *
+ * If the private copy cannot be allocated, the error buffer is set and
+ * *bytecode / *bytecodeLength are set to NULL / 0.
+ */
+void MOJOSHADER_d3d11CompileVertexShader(unsigned long long inputLayoutHash,
+                                         void* elements, int elementCount,
+                                         void **bytecode, int *bytecodeLength)
+{
+    MOJOSHADER_d3d11Shader *vshader = ctx->vertexShader;
+    ID3D10Blob *blob;
+
+    // Don't compile if there's already a mapping for this layout.
+    for (int i = 0; i < vshader->numMaps; i++)
+    {
+        if (inputLayoutHash == vshader->shaderMaps[i].vertex.layoutHash)
+        {
+            blob = vshader->shaderMaps[i].vertex.blob;
+            *bytecode = ID3D10Blob_GetBufferPointer(blob);
+            *bytecodeLength = ID3D10Blob_GetBufferSize(blob);
+            return;
+        } // if
+    } // for
+
+    // Check for and replace non-float types
+    D3D11_INPUT_ELEMENT_DESC *d3dElements = (D3D11_INPUT_ELEMENT_DESC*) elements;
+    const char *origSource = vshader->parseData->output;
+    int srcLength = vshader->parseData->output_len;
+    char *newSource = (char*) origSource;
+    const size_t typeLen = strlen("float4");
+    for (int i = 0; i < elementCount; i += 1)
+    {
+        D3D11_INPUT_ELEMENT_DESC e = d3dElements[i];
+
+        const char *replace;
+        if (element_is_uint(e.Format))
+            replace = " uint4";
+        else if (element_is_int(e.Format))
+            replace = "  int4";
+        else
+            replace = NULL;
+
+        if (replace != NULL)
+        {
+            char sem[16];
+            memset(sem, '\0', sizeof(sem));
+            snprintf(sem, sizeof(sem), "%s%d", e.SemanticName, e.SemanticIndex);
+            // !!! FIXME: POSITIONT has no index. What to do? -caleb
+
+            // Lazily make a writable copy the first time a patch is needed.
+            if (newSource == origSource)
+            {
+                newSource = (char *) ctx->malloc_fn(srcLength + 1,
+                                                    ctx->malloc_data);
+                if (newSource == NULL)
+                {
+                    set_error("out of memory");
+                    *bytecode = NULL;
+                    *bytecodeLength = 0;
+                    return;
+                } // if
+                strcpy(newSource, origSource);
+            } // if
+
+            char *ptr = strstr(newSource, sem);
+            assert(ptr != NULL && "Could not find semantic in shader source!");
+            if (ptr == NULL)
+                continue;  // release builds: leave the declaration untouched.
+
+            // Walk back over three spaces to land just after the type name,
+            // never stepping before the start of the buffer.
+            int spaces = 0;
+            while ((spaces < 3) && (ptr > newSource))
+                if (*(--ptr) == ' ') spaces++;
+
+            assert(spaces == 3 && "Malformed input declaration in shader source!");
+            if ((spaces == 3) && ((size_t) (ptr - newSource) >= typeLen))
+                memcpy(ptr - typeLen, replace, strlen(replace));
+        } // if
+    } // for
+
+    // Expand the map array, if needed
+    expand_map(vshader);
+
+    // Add the new mapping
+    vshader->shaderMaps[vshader->numMaps].vertex.layoutHash = inputLayoutHash;
+    ID3D11VertexShader *vs = compileVertexShader(vshader, newSource,
+                                                 srcLength, &blob);
+
+    // The patched copy is only needed as compiler input.
+    // NOTE(review): assumes compileVertexShader() does not keep the source
+    // pointer after it returns -- confirm against its definition.
+    if (newSource != origSource)
+        ctx->free_fn(newSource, ctx->malloc_data);
+
+    vshader->shaderMaps[vshader->numMaps].val = vs;
+    vshader->shaderMaps[vshader->numMaps].vertex.blob = blob;
+    vshader->numMaps++;
+    assert(vs != NULL);
+
+    // Return the bytecode info
+    *bytecode = ID3D10Blob_GetBufferPointer(blob);
+    *bytecodeLength = ID3D10Blob_GetBufferSize(blob);
+} // MOJOSHADER_d3d11CompileVertexShader
+
+/* Applies any pending shader bindings to the D3D11 device context.
+ *
+ * Vertex stage: if flagged, finds the compiled variant whose layout hash
+ * equals `inputLayoutHash`, sets it and the shader's constant buffer at
+ * slot 0, and clears the flag. The variant must already exist (asserted).
+ *
+ * Pixel stage: if flagged, finds the variant mapped to the current vertex
+ * shader, compiling and caching a new one when there is none, then sets it
+ * and the shader's constant buffer at slot 0 and clears the flag.
+ */
+void MOJOSHADER_d3d11ProgramReady(unsigned long long inputLayoutHash)
+{
+    MOJOSHADER_d3d11Shader *vshader = ctx->vertexShader;
+    MOJOSHADER_d3d11Shader *pshader = ctx->pixelShader;
+
+    if (ctx->vertexNeedsBound)
+    {
+        ID3D11VertexShader *realVS = NULL;
+
+        // Locate the first variant built for this input layout.
+        int slot = 0;
+        while ((slot < vshader->numMaps) &&
+               (vshader->shaderMaps[slot].vertex.layoutHash != inputLayoutHash))
+        {
+            slot++;
+        } // while
+
+        if (slot < vshader->numMaps)
+            realVS = (ID3D11VertexShader *) vshader->shaderMaps[slot].val;
+
+        assert(realVS != NULL);
+        ID3D11DeviceContext_VSSetShader(ctx->deviceContext, realVS, NULL, 0);
+        ID3D11DeviceContext_VSSetConstantBuffers(ctx->deviceContext, 0, 1,
+                                                 &vshader->ubo);
+        ctx->vertexNeedsBound = 0;
+    } // if
+
+    if (ctx->pixelNeedsBound)
+    {
+        ID3D11PixelShader *realPS = NULL;
+
+        // Locate the first variant already paired with this vertex shader.
+        int slot = 0;
+        while ((slot < pshader->numMaps) &&
+               (pshader->shaderMaps[slot].pixel.vshader != vshader))
+        {
+            slot++;
+        } // while
+
+        if (slot < pshader->numMaps)
+            realPS = (ID3D11PixelShader *) pshader->shaderMaps[slot].val;
+
+        if (realPS == NULL)
+        {
+            // No usable pairing yet: grow the map if needed and add one.
+            expand_map(pshader);
+
+            pshader->shaderMaps[pshader->numMaps].pixel.vshader = vshader;
+            realPS = compilePixelShader(vshader, pshader);
+            pshader->shaderMaps[pshader->numMaps].val = realPS;
+            pshader->numMaps++;
+            assert(realPS != NULL);
+        } // if
+
+        ID3D11DeviceContext_PSSetShader(ctx->deviceContext, realPS, NULL, 0);
+        ID3D11DeviceContext_PSSetConstantBuffers(ctx->deviceContext, 0, 1,
+                                                 &pshader->ubo);
+        ctx->pixelNeedsBound = 0;
+    } // if
+} // MOJOSHADER_d3d11ProgramReady
+
+/* Returns the module's error buffer. The pointer refers to storage owned by
+ * this file; callers must not free it.
+ */
+const char *MOJOSHADER_d3d11GetError(void)
+{
+    const char *message = error_buffer;
+    return message;
+} // MOJOSHADER_d3d11GetError
+
+#endif /* SUPPORT_PROFILE_HLSL */
+
+// end of mojoshader_d3d11.c ...
--- a/mojoshader_internal.h	Mon May 04 09:55:32 2020 -0400
+++ b/mojoshader_internal.h	Thu May 21 12:00:55 2020 -0400
@@ -131,6 +131,10 @@
 #define SUPPORT_PROFILE_BYTECODE 1
 #endif
 
+#ifndef SUPPORT_PROFILE_HLSL
+#define SUPPORT_PROFILE_HLSL 1
+#endif
+
 #ifndef SUPPORT_PROFILE_GLSL
 #define SUPPORT_PROFILE_GLSL 1
 #endif
--- a/profiles/mojoshader_profile.h	Mon May 04 09:55:32 2020 -0400
+++ b/profiles/mojoshader_profile.h	Thu May 21 12:00:55 2020 -0400
@@ -165,12 +165,12 @@
     int predicated;
     int uses_pointsize;
     int uses_fog;
+    int need_max_float;
 
     // !!! FIXME: move these into SUPPORT_PROFILE sections.
     int glsl_generated_lit_helper;
     int glsl_generated_texlod_setup;
     int glsl_generated_texm3x3spec_helper;
-    int glsl_need_max_float;
     int arb1_wrote_position;
     // !!! FIXME: move these into SUPPORT_PROFILE sections.
 
@@ -213,6 +213,10 @@
 #if SUPPORT_PROFILE_GLSPIRV
     int profile_supports_glspirv;
 #endif
+
+#if SUPPORT_PROFILE_HLSL
+    char hlsl_outpos_name[16];
+#endif
 } Context;
 
 // Use these macros so we can remove all bits of these profiles from the build.
--- a/profiles/mojoshader_profile_glsl.c	Mon May 04 09:55:32 2020 -0400
+++ b/profiles/mojoshader_profile_glsl.c	Thu May 21 12:00:55 2020 -0400
@@ -641,7 +641,7 @@
     if (shader_is_vertex(ctx))
         output_line(ctx, "uniform float vpFlip;");
 #endif
-    if (ctx->glsl_need_max_float)
+    if (ctx->need_max_float)
         output_line(ctx, "const float FLT_MAX = 1e38;");
     pop_output(ctx);
 } // emit_GLSL_finalize
@@ -1196,7 +1196,7 @@
 {
     char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
     char code[128];
-    ctx->glsl_need_max_float = 1;
+    ctx->need_max_float = 1;
     make_GLSL_destarg_assign(ctx, code, sizeof (code), "(%s == 0.0) ? FLT_MAX : 1.0 / %s", src0, src0);
     output_line(ctx, "%s", code);
 } // emit_GLSL_RCP
@@ -1205,7 +1205,7 @@
 {
     char src0[64]; make_GLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
     char code[128];
-    ctx->glsl_need_max_float = 1;
+    ctx->need_max_float = 1;
     make_GLSL_destarg_assign(ctx, code, sizeof (code), "(%s == 0.0) ? FLT_MAX : inversesqrt(abs(%s))", src0, src0);
     output_line(ctx, "%s", code);
 } // emit_GLSL_RSQ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/profiles/mojoshader_profile_hlsl.c	Thu May 21 12:00:55 2020 -0400
@@ -0,0 +1,2271 @@
+/**
+ * MojoShader; generate shader programs from bytecode of compiled
+ *  Direct3D shaders.
+ *
+ * Please see the file LICENSE.txt in the source's root directory.
+ *
+ *  This file written by Ryan C. Gordon.
+ */
+
+#define __MOJOSHADER_INTERNAL__ 1
+#include "mojoshader_profile.h"
+
+#pragma GCC visibility push(hidden)
+
+// !!! FIXME: A lot of this is cut-and-paste from the GLSL/Metal versions.
+#if SUPPORT_PROFILE_HLSL
+
+// Defines emit_HLSL_<op>() as a stub for an opcode this profile cannot
+//  translate: the generated function only reports
+//  "<op> unimplemented in hlsl profile" through fail().
+#define EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(op) \
+    void emit_HLSL_##op(Context *ctx) { \
+        fail(ctx, #op " unimplemented in hlsl profile"); \
+    }
+
+// Produces the register-name prefix for (regtype, regnum) and fills
+//  regnum_str with the numeric suffix. HLSL uses the same naming as the
+//  D3D profile, so this simply forwards to that helper.
+static inline const char *get_HLSL_register_string(Context *ctx,
+                        const RegisterType regtype, const int regnum,
+                        char *regnum_str, const size_t regnum_size)
+{
+    const char *prefix = get_D3D_register_string(ctx, regtype, regnum,
+                                                 regnum_str, regnum_size);
+    return prefix;
+} // get_HLSL_register_string
+
+// Maps a constant register type to the HLSL type used for its uniforms:
+//  "float4", "int4" or "bool". Any other register type is reported via
+//  fail() and yields NULL.
+const char *get_HLSL_uniform_type(Context *ctx, const RegisterType rtype)
+{
+    if (rtype == REG_TYPE_CONST)
+        return "float4";
+
+    if (rtype == REG_TYPE_CONSTINT)
+        return "int4";
+
+    if (rtype == REG_TYPE_CONSTBOOL)
+        return "bool";
+
+    fail(ctx, "BUG: used a uniform we don't know how to define.");
+    return NULL;
+} // get_HLSL_uniform_type
+
+// Writes the HLSL variable name for register (rt, regnum) into buf (at most
+//  len bytes, via snprintf) as "<prefix><number>" and returns buf.
+const char *get_HLSL_varname_in_buf(Context *ctx, RegisterType rt,
+                                    int regnum, char *buf,
+                                    const size_t len)
+{
+    char suffix[16];
+    const char *prefix;
+
+    prefix = get_HLSL_register_string(ctx, rt, regnum, suffix, sizeof (suffix));
+    snprintf(buf, len, "%s%s", prefix, suffix);
+    return buf;
+} // get_HLSL_varname_in_buf
+
+// Returns a StrDup()'d copy of the HLSL variable name for register
+//  (rt, regnum).
+const char *get_HLSL_varname(Context *ctx, RegisterType rt, int regnum)
+{
+    char scratch[64];
+    const char *name = get_HLSL_varname_in_buf(ctx, rt, regnum,
+                                               scratch, sizeof (scratch));
+    return StrDup(ctx, name);
+} // get_HLSL_varname
+
+// Formats the name of a constant array as "const_array_<base>_<size>" into
+//  buf (bounded by buflen) and returns buf. ctx is not used.
+static inline const char *get_HLSL_const_array_varname_in_buf(Context *ctx,
+                                                const int base, const int size,
+                                                char *buf, const size_t buflen)
+{
+    static const char name_fmt[] = "const_array_%d_%d";
+    snprintf(buf, buflen, name_fmt, base, size);
+    return buf;
+} // get_HLSL_const_array_varname_in_buf
+
+// Returns a StrDup()'d copy of the constant array name for (base, size).
+const char *get_HLSL_const_array_varname(Context *ctx, int base, int size)
+{
+    char scratch[64];
+    const char *name = get_HLSL_const_array_varname_in_buf(ctx, base, size,
+                                                scratch, sizeof (scratch));
+    return StrDup(ctx, name);
+} // get_HLSL_const_array_varname
+
+// Copies the fixed name of the vertex input array into buf (bounded by
+//  buflen) and returns buf. ctx is not used.
+static inline const char *get_HLSL_input_array_varname(Context *ctx,
+                                                char *buf, const size_t buflen)
+{
+    static const char array_name[] = "vertex_input_array";
+    snprintf(buf, buflen, "%s", array_name);
+    return buf;
+} // get_HLSL_input_array_varname
+
+// Writes the name of the uniform array for a constant register type into
+//  buf (bounded by len) as "uniforms_<hlsl type>", e.g. "uniforms_float4",
+//  and returns buf.
+//
+// For an unsupported register type get_HLSL_uniform_type() has already
+//  called fail() and returned NULL; an empty type string is substituted
+//  then, because passing NULL to "%s" is undefined behaviour.
+const char *get_HLSL_uniform_array_varname(Context *ctx,
+                                           const RegisterType regtype,
+                                           char *buf, const size_t len)
+{
+    const char *type = get_HLSL_uniform_type(ctx, regtype);
+    snprintf(buf, len, "uniforms_%s", (type != NULL) ? type : "");
+    return buf;
+} // get_HLSL_uniform_array_varname
+
+// Writes the variable name of the current instruction's destination
+//  register into buf (bounded by len) and returns buf.
+const char *get_HLSL_destarg_varname(Context *ctx, char *buf, size_t len)
+{
+    const RegisterType dsttype = ctx->dest_arg.regtype;
+    const int dstnum = ctx->dest_arg.regnum;
+    return get_HLSL_varname_in_buf(ctx, dsttype, dstnum, buf, len);
+} // get_HLSL_destarg_varname
+
+// Writes the variable name of source argument `idx` of the current
+//  instruction into buf (bounded by len) and returns buf. An out-of-range
+//  idx is reported via fail() and produces an empty string.
+const char *get_HLSL_srcarg_varname(Context *ctx, const size_t idx,
+                                    char *buf, size_t len)
+{
+    if (idx < STATICARRAYLEN(ctx->source_args))
+    {
+        const SourceArgInfo *src = &ctx->source_args[idx];
+        return get_HLSL_varname_in_buf(ctx, src->regtype, src->regnum,
+                                       buf, len);
+    } // if
+
+    fail(ctx, "Too many source args");
+    *buf = '\0';
+    return buf;
+} // get_HLSL_srcarg_varname
+
+const char *make_HLSL_destarg_assign(Context *, char *, const size_t,
+                                     const char *, ...) ISPRINTF(4,5);
+
+// Builds one HLSL assignment statement for the current instruction's
+//  destination register:
+//
+//      <reg><.writemask> = <saturate(>(<operation>)<shift><)>;
+//
+//  ctx     translation context; ctx->dest_arg describes the destination.
+//  buf     output buffer, buflen bytes.
+//  fmt,... printf-style description of the right-hand side expression.
+//
+// Returns buf. buf is set to an empty string when the writemask is empty
+//  (the instruction is a no-op) and on every failure: predicated
+//  destinations, an operation string that does not fit the scratch
+//  buffer, or a finished statement that does not fit in buf. Failures are
+//  reported through fail().
+const char *make_HLSL_destarg_assign(Context *ctx, char *buf,
+                                     const size_t buflen,
+                                     const char *fmt, ...)
+{
+    int need_parens = 0;
+    const DestArgInfo *arg = &ctx->dest_arg;
+
+    if (arg->writemask == 0)
+    {
+        *buf = '\0';
+        return buf;  // no writemask? It's a no-op.
+    } // if
+
+    const char *clampleft = "";
+    const char *clampright = "";
+    if (arg->result_mod & MOD_SATURATE)
+    {
+        clampleft = "saturate(";
+        clampright = ")";
+    } // if
+
+    // MSDN says MOD_PP is a hint and many implementations ignore it. So do we.
+
+    // CENTROID only allowed in DCL opcodes, which shouldn't come through here.
+    assert((arg->result_mod & MOD_CENTROID) == 0);
+
+    if (ctx->predicated)
+    {
+        fail(ctx, "predicated destinations unsupported");  // !!! FIXME
+        *buf = '\0';
+        return buf;
+    } // if
+
+    char operation[256];
+    va_list ap;
+    va_start(ap, fmt);
+    const int len = vsnprintf(operation, sizeof (operation), fmt, ap);
+    va_end(ap);
+    // A negative result is an encoding error; test it explicitly instead of
+    //  relying on the signed-to-unsigned conversion in the size comparison.
+    if ((len < 0) || (((size_t) len) >= sizeof (operation)))
+    {
+        fail(ctx, "operation string too large");  // I'm lazy.  :P
+        *buf = '\0';
+        return buf;
+    } // if
+
+    const char *result_shift_str = "";
+    switch (arg->result_shift)
+    {
+        case 0x1: result_shift_str = " * 2.0"; break;
+        case 0x2: result_shift_str = " * 4.0"; break;
+        case 0x3: result_shift_str = " * 8.0"; break;
+        case 0xD: result_shift_str = " / 8.0"; break;
+        case 0xE: result_shift_str = " / 4.0"; break;
+        case 0xF: result_shift_str = " / 2.0"; break;
+    } // switch
+    // The shift applies to the whole operation, so it must be parenthesized.
+    need_parens |= (result_shift_str[0] != '\0');
+
+    char regnum_str[16];
+    const char *regtype_str = get_HLSL_register_string(ctx, arg->regtype,
+                                                       arg->regnum, regnum_str,
+                                                       sizeof (regnum_str));
+    char writemask_str[6];
+    size_t i = 0;
+    const int scalar = isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum);
+    if (!scalar && !writemask_xyzw(arg->writemask))
+    {
+        writemask_str[i++] = '.';
+        if (arg->writemask0) writemask_str[i++] = 'x';
+        if (arg->writemask1) writemask_str[i++] = 'y';
+        if (arg->writemask2) writemask_str[i++] = 'z';
+        if (arg->writemask3) writemask_str[i++] = 'w';
+    } // if
+    assert(i < sizeof (writemask_str));  // check before writing the terminator.
+    writemask_str[i] = '\0';
+
+    const char *leftparen = (need_parens) ? "(" : "";
+    const char *rightparen = (need_parens) ? ")" : "";
+
+    const int written = snprintf(buf, buflen, "%s%s%s = %s%s%s%s%s%s;",
+             regtype_str, regnum_str, writemask_str, clampleft, leftparen,
+             operation, rightparen, result_shift_str, clampright);
+    // A truncated statement would be emitted as malformed HLSL; report it
+    //  instead of silently passing it along.
+    if ((written < 0) || (((size_t) written) >= buflen))
+    {
+        fail(ctx, "destination assignment string too large");
+        if (buflen > 0)
+            *buf = '\0';
+    } // if
+    return buf;
+} // make_HLSL_destarg_assign
+
+
+// Builds the ".xyzw"-style suffix for a source argument into swiz_str.
+//
+// `swizzle` holds four 2-bit channel selectors (x in bits 0-1 up to w in
+//  bits 6-7); `writemask` has one bit per destination component. One
+//  channel letter is emitted for each set writemask bit. Nothing but the
+//  terminator is written when the swizzle is the identity and the
+//  writemask covers all four components. Returns swiz_str.
+char *make_HLSL_swizzle_string(char *swiz_str, const size_t strsize,
+                               const int swizzle, const int writemask)
+{
+    size_t pos = 0;
+    const int is_default = no_swizzle(swizzle) && writemask_xyzw(writemask);
+
+    if (!is_default)
+    {
+        int component;
+        swiz_str[pos++] = '.';
+        for (component = 0; component < 4; component++)
+        {
+            const int enabled = (writemask >> component) & 0x1;
+            const int selector = (swizzle >> (component * 2)) & 0x3;
+            if (enabled)
+                swiz_str[pos++] = swizzle_channels[selector];
+        } // for
+    } // if
+
+    assert(pos < strsize);
+    swiz_str[pos] = '\0';
+    return swiz_str;
+} // make_HLSL_swizzle_string
+
+
+// Builds the HLSL expression text for source argument `idx` of the current
+//  instruction and writes it to buf (bounded by buflen).
+//
+// The result is assembled, in order, from:
+//   - a prefix/suffix pair implementing the source modifier (negate, bias,
+//     abs, ...),
+//   - the register's variable name, or for relative addressing an array
+//     name followed by "[<offset> + <address register>.<component>]",
+//   - a swizzle suffix restricted to the components in `writemask`
+//     (omitted for scalar registers).
+//
+// Returns buf. buf is left as an empty string when idx is out of range,
+//  when the modifier is SRCMOD_DZ/SRCMOD_DW (unsupported), or when no
+//  register name could be determined; each of those is reported via fail().
+const char *make_HLSL_srcarg_string(Context *ctx, const size_t idx,
+                                    const int writemask, char *buf,
+                                    const size_t buflen)
+{
+    *buf = '\0';
+
+    if (idx >= STATICARRAYLEN(ctx->source_args))
+    {
+        fail(ctx, "Too many source args");
+        return buf;
+    } // if
+
+    const SourceArgInfo *arg = &ctx->source_args[idx];
+
+    // Text wrapped around the register to express the source modifier.
+    const char *premod_str = "";
+    const char *postmod_str = "";
+    switch (arg->src_mod)
+    {
+        case SRCMOD_NEGATE:
+            premod_str = "-";
+            break;
+
+        case SRCMOD_BIASNEGATE:
+            premod_str = "-(";
+            postmod_str = " - 0.5)";
+            break;
+
+        case SRCMOD_BIAS:
+            premod_str = "(";
+            postmod_str = " - 0.5)";
+            break;
+
+        case SRCMOD_SIGNNEGATE:
+            premod_str = "-((";
+            postmod_str = " - 0.5) * 2.0)";
+            break;
+
+        case SRCMOD_SIGN:
+            premod_str = "((";
+            postmod_str = " - 0.5) * 2.0)";
+            break;
+
+        case SRCMOD_COMPLEMENT:
+            premod_str = "(1.0 - ";
+            postmod_str = ")";
+            break;
+
+        case SRCMOD_X2NEGATE:
+            premod_str = "-(";
+            postmod_str = " * 2.0)";
+            break;
+
+        case SRCMOD_X2:
+            premod_str = "(";
+            postmod_str = " * 2.0)";
+            break;
+
+        // The two statements after each `return` below are unreachable.
+        case SRCMOD_DZ:
+            fail(ctx, "SRCMOD_DZ unsupported"); return buf; // !!! FIXME
+            postmod_str = "_dz";
+            break;
+
+        case SRCMOD_DW:
+            fail(ctx, "SRCMOD_DW unsupported"); return buf; // !!! FIXME
+            postmod_str = "_dw";
+            break;
+
+        case SRCMOD_ABSNEGATE:
+            premod_str = "-abs(";
+            postmod_str = ")";
+            break;
+
+        case SRCMOD_ABS:
+            premod_str = "abs(";
+            postmod_str = ")";
+            break;
+
+        case SRCMOD_NOT:
+            premod_str = "!";
+            break;
+
+        case SRCMOD_NONE:
+        case SRCMOD_TOTAL:
+             break;  // stop compiler whining.
+    } // switch
+
+    const char *regtype_str = NULL;
+
+    // Directly addressed register: its plain variable name. The scratch
+    //  space comes from alloca(), so it lives until this function returns.
+    if (!arg->relative)
+    {
+        regtype_str = get_HLSL_varname_in_buf(ctx, arg->regtype, arg->regnum,
+                                              (char *) alloca(64), 64);
+    } // if
+
+    // Pieces of the "[offset + addr.c]" index; all empty unless relative.
+    const char *rel_lbracket = "";
+    char rel_offset[32] = { '\0' };
+    const char *rel_rbracket = "";
+    char rel_swizzle[4] = { '\0' };
+    const char *rel_regtype_str = "";
+    if (arg->relative)
+    {
+        if (arg->regtype == REG_TYPE_INPUT)
+            regtype_str=get_HLSL_input_array_varname(ctx,(char*)alloca(64),64);
+        else
+        {
+            assert(arg->regtype == REG_TYPE_CONST);
+            const int arrayidx = arg->relative_array->index;
+            const int offset = arg->regnum - arrayidx;
+            assert(offset >= 0);
+            if (arg->relative_array->constant)
+            {
+                // Constant array: index from the start of its own
+                //  const_array_<base>_<size> variable.
+                const int arraysize = arg->relative_array->count;
+                regtype_str = get_HLSL_const_array_varname_in_buf(ctx,
+                                arrayidx, arraysize, (char *) alloca(64), 64);
+                if (offset != 0)
+                    snprintf(rel_offset, sizeof (rel_offset), "%d + ", offset);
+            } // if
+            else
+            {
+                // Otherwise index into the shared uniform array, starting at
+                //  the ARRAYBASE_<n> symbol emitted for this array.
+                regtype_str = get_HLSL_uniform_array_varname(ctx, arg->regtype,
+                                                      (char *) alloca(64), 64);
+                if (offset == 0)
+                {
+                    snprintf(rel_offset, sizeof (rel_offset),
+                             "ARRAYBASE_%d + ", arrayidx);
+                } // if
+                else
+                {
+                    snprintf(rel_offset, sizeof (rel_offset),
+                             "(ARRAYBASE_%d + %d) + ", arrayidx, offset);
+                } // else
+            } // else
+        } // else
+
+        rel_lbracket = "[";
+
+        // The register supplying the dynamic part of the index, and which
+        //  of its components to read.
+        rel_regtype_str = get_HLSL_varname_in_buf(ctx, arg->relative_regtype,
+                                                    arg->relative_regnum,
+                                                    (char *) alloca(64), 64);
+        rel_swizzle[0] = '.';
+        rel_swizzle[1] = swizzle_channels[arg->relative_component];
+        rel_swizzle[2] = '\0';
+        rel_rbracket = "]";
+    } // if
+
+    // Scalar registers take no swizzle suffix.
+    char swiz_str[6] = { '\0' };
+    if (!isscalar(ctx, ctx->shader_type, arg->regtype, arg->regnum))
+    {
+        make_HLSL_swizzle_string(swiz_str, sizeof (swiz_str),
+                                 arg->swizzle, writemask);
+    } // if
+
+    if (regtype_str == NULL)
+    {
+        fail(ctx, "Unknown source register type.");
+        return buf;
+    } // if
+
+    snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s",
+             premod_str, regtype_str, rel_lbracket, rel_offset,
+             rel_regtype_str, rel_swizzle, rel_rbracket, swiz_str,
+             postmod_str);
+    // !!! FIXME: make sure the scratch buffer was large enough.
+    return buf;
+} // make_HLSL_srcarg_string
+
+// generate some convenience functions.
+// Generates make_HLSL_srcarg_string_<mask>(ctx, idx, buf, buflen): a thin
+//  wrapper around make_HLSL_srcarg_string() with the writemask fixed to
+//  `bitmask`. The variants cover single components (x/y/z/w, and "scalar"
+//  which is the same as x), the first two or three components (vec2/vec3),
+//  all four (full), and the current destination's writemask (masked).
+//  The helper macro is undefined again once the wrappers exist.
+#define MAKE_HLSL_SRCARG_STRING_(mask, bitmask) \
+    static inline const char *make_HLSL_srcarg_string_##mask(Context *ctx, \
+                                                const size_t idx, char *buf, \
+                                                const size_t buflen) { \
+        return make_HLSL_srcarg_string(ctx, idx, bitmask, buf, buflen); \
+    }
+MAKE_HLSL_SRCARG_STRING_(x, (1 << 0))
+MAKE_HLSL_SRCARG_STRING_(y, (1 << 1))
+MAKE_HLSL_SRCARG_STRING_(z, (1 << 2))
+MAKE_HLSL_SRCARG_STRING_(w, (1 << 3))
+MAKE_HLSL_SRCARG_STRING_(scalar, (1 << 0))
+MAKE_HLSL_SRCARG_STRING_(full, 0xF)
+MAKE_HLSL_SRCARG_STRING_(masked, ctx->dest_arg.writemask)
+MAKE_HLSL_SRCARG_STRING_(vec3, 0x7)
+MAKE_HLSL_SRCARG_STRING_(vec2, 0x3)
+#undef MAKE_HLSL_SRCARG_STRING_
+
+// special cases for comparison opcodes...
+
+// Translates the current instruction's comparison control into the matching
+//  operator text. An out-of-range control value is reported via fail() and
+//  yields an empty string.
+const char *get_HLSL_comparison_string_scalar(Context *ctx)
+{
+    const char *comps[] = { "", ">", "==", ">=", "<", "!=", "<=" };
+    const char *op = "";
+
+    if (ctx->instruction_controls < STATICARRAYLEN(comps))
+        op = comps[ctx->instruction_controls];
+    else
+        fail(ctx, "unknown comparison control");
+
+    return op;
+} // get_HLSL_comparison_string_scalar
+
+// Vector flavour of the comparison lookup. Standard C operators work for
+//  vectors in HLSL, so the scalar table is reused unchanged.
+const char *get_HLSL_comparison_string_vector(Context *ctx)
+{
+    const char *op = get_HLSL_comparison_string_scalar(ctx);
+    return op;
+} // get_HLSL_comparison_string_vector
+
+
+// Begins HLSL emission. Only vertex and pixel shaders are accepted; any
+//  other shader type is reported via failf() and nothing else happens.
+//  When no entry point name was supplied, "VertexShader" or "PixelShader"
+//  is used. Output is then directed to the mainline section with one level
+//  of indentation. profilestr is not used.
+void emit_HLSL_start(Context *ctx, const char *profilestr)
+{
+    const int supported = shader_is_vertex(ctx) || shader_is_pixel(ctx);
+    if (!supported)
+    {
+        failf(ctx, "Shader type %u unsupported in this profile.",
+              (uint) ctx->shader_type);
+        return;
+    } // if
+
+    if (ctx->mainfn == NULL)
+    {
+        const char *default_name = NULL;
+        if (shader_is_vertex(ctx))
+            default_name = "VertexShader";
+        else if (shader_is_pixel(ctx))
+            default_name = "PixelShader";
+
+        if (default_name != NULL)
+            ctx->mainfn = StrDup(ctx, default_name);
+    } // if
+
+    set_output(ctx, &ctx->mainline);
+    ctx->indent++;
+} // emit_HLSL_start
+
+void emit_HLSL_RET(Context *ctx);
+
+// Finishes the shader body.
+//  - Pixel shaders older than 2.0 write their color to r0 rather than oC0,
+//    so the value is copied across. RET isn't available before ps_2_0, so
+//    an earlier RET cannot get in the way of this.
+//  - If the last opcode seen was not RET, one is emitted.
+// !!! FIXME: maybe handle both of these at a higher level?
+void emit_HLSL_end(Context *ctx)
+{
+    const int legacy_pixel = shader_is_pixel(ctx) &&
+                             !shader_version_atleast(ctx, 2, 0);
+    if (legacy_pixel)
+    {
+        set_used_register(ctx, REG_TYPE_COLOROUT, 0, 1);
+        output_line(ctx, "oC0 = r0;");
+    } // if
+
+    if (ctx->previous_opcode == OPCODE_RET)
+        return;  // stream already ended with a RET.
+
+    emit_HLSL_RET(ctx);
+} // emit_HLSL_end
+
+// The phase marker produces no HLSL output.
+void emit_HLSL_phase(Context *ctx)
+{
+    (void) ctx;  // intentionally nothing to do.
+} // emit_HLSL_phase
+
+// Emits the declaration "<type> uniforms_<type>[<size>];" for one class of
+//  constant registers. Nothing is emitted when size is not positive.
+//
+// The element type comes from get_HLSL_uniform_type(), the same table
+//  get_HLSL_uniform_array_varname() uses, so the type and the array name
+//  cannot drift apart. An unsupported register type is reported by that
+//  helper (via fail()) and nothing is emitted.
+void output_HLSL_uniform_array(Context *ctx, const RegisterType regtype,
+                               const int size)
+{
+    if (size <= 0)
+        return;
+
+    // Validate the register type before building any names from it.
+    const char *typ = get_HLSL_uniform_type(ctx, regtype);
+    if (typ == NULL)
+        return;  // already reported.
+
+    char buf[64];
+    get_HLSL_uniform_array_varname(ctx, regtype, buf, sizeof (buf));
+    output_line(ctx, "%s %s[%d];", typ, buf, size);
+} // output_HLSL_uniform_array
+
+// Emits everything that can only be written once the whole shader has been
+//  translated:
+//   - the cbuffer holding the float4/int4/bool uniform arrays (preflight),
+//   - the entry point's signature, opening brace and, when there are
+//     outputs, the zero-initialized `output` local (mainline_intro) plus
+//     the trailing "return output;" (mainline),
+//   - the closing "};" of the input and output structs,
+//   - a blank line after a non-empty globals section,
+//   - the FLT_MAX define, when an opcode asked for it (mainline_top).
+// Relative addressing of input registers is reported as unsupported.
+void emit_HLSL_finalize(Context *ctx)
+{
+    if (ctx->have_relative_input_registers) // !!! FIXME
+        fail(ctx, "Relative addressing of input registers not supported.");
+
+    // Check uniform_float4_count too since TEXBEM affects it
+    if (ctx->uniform_count > 0 || ctx->uniform_float4_count > 0)
+    {
+        push_output(ctx, &ctx->preflight);
+        output_line(ctx, "cbuffer %s_Uniforms : register(b0)", ctx->mainfn);
+        output_line(ctx, "{");
+        ctx->indent++;
+        output_HLSL_uniform_array(ctx, REG_TYPE_CONST, ctx->uniform_float4_count);
+        output_HLSL_uniform_array(ctx, REG_TYPE_CONSTINT, ctx->uniform_int4_count);
+        output_HLSL_uniform_array(ctx, REG_TYPE_CONSTBOOL, ctx->uniform_bool_count);
+        ctx->indent--;
+        output_line(ctx, "};");
+        output_blank_line(ctx);
+        pop_output(ctx);
+    } // if
+
+    // Fill in the shader's mainline function signature.
+    // Return type is "<mainfn>_Output" (or void without outputs); the single
+    //  parameter is "<mainfn>_Input input" (or nothing without inputs).
+    push_output(ctx, &ctx->mainline_intro);
+    output_line(ctx, "%s%s %s(%s%s%s)",
+                ctx->outputs ? ctx->mainfn : "void",
+                ctx->outputs ? "_Output" : "",
+                ctx->mainfn,
+                ctx->inputs ? ctx->mainfn : "",
+                ctx->inputs ? "_Input" : "",
+                ctx->inputs ? " input" : "");
+    output_line(ctx, "{");
+
+    if (ctx->outputs)
+    {
+        ctx->indent++;
+        output_line(ctx, "%s%s output = (%s%s) 0;",
+                    ctx->mainfn, "_Output", ctx->mainfn, "_Output");
+
+        // The matching return statement goes at the end of the body.
+        push_output(ctx, &ctx->mainline);
+        ctx->indent++;
+        output_line(ctx, "return output;");
+        pop_output(ctx);
+    } // if
+    pop_output(ctx);
+
+    // Close the input struct, if one was started.
+    if (ctx->inputs)
+    {
+        push_output(ctx, &ctx->inputs);
+        output_line(ctx, "};");
+        output_blank_line(ctx);
+        pop_output(ctx);
+    } // if
+
+    // Close the output struct, first adding the SV_Position member when an
+    //  output position name was recorded.
+    if (ctx->outputs)
+    {
+        push_output(ctx, &ctx->outputs);
+
+        // !!! FIXME: Maybe have a better check for this?
+        if (ctx->hlsl_outpos_name[0] != '\0')
+        {
+            output_line(ctx, "\tfloat4 m_%s : SV_Position;",
+                        ctx->hlsl_outpos_name);
+        } // if
+
+        output_line(ctx, "};");
+        output_blank_line(ctx);
+        pop_output(ctx);
+    } // if
+
+    // throw some blank lines around to make source more readable.
+    if (ctx->globals)  // don't add a blank line if the section is empty.
+    {
+        push_output(ctx, &ctx->globals);
+        output_blank_line(ctx);
+        pop_output(ctx);
+    } // if
+
+    // FLT_MAX is referenced by generated code when need_max_float is set.
+    if (ctx->need_max_float)
+    {
+        push_output(ctx, &ctx->mainline_top);
+        ctx->indent++;
+        output_line(ctx, "#define FLT_MAX 1e38");
+        ctx->indent--;
+        pop_output(ctx);
+    } // if
+} // emit_HLSL_finalize
+
+// Declares a local variable for one register at the top of the main
+//  function (the mainline_top buffer). Loop and label registers produce no
+//  declaration; any other unexpected register type is reported via fail().
+void emit_HLSL_global(Context *ctx, RegisterType regtype, int regnum)
+{
+    char name[64];
+    get_HLSL_varname_in_buf(ctx, regtype, regnum, name, sizeof (name));
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent++;
+
+    if (regtype == REG_TYPE_TEMP)
+        output_line(ctx, "float4 %s;", name);
+
+    else if (regtype == REG_TYPE_PREDICATE)
+        output_line(ctx, "bool4 %s;", name);
+
+    else if (regtype == REG_TYPE_ADDRESS)
+    {
+        if (shader_is_vertex(ctx))
+            output_line(ctx, "int4 %s;", name);
+
+        // In a pixel shader this is actually REG_TYPE_TEXTURE. Before
+        //  ps_1_4, texture registers behave like temps that start out
+        //  holding the texture coordinates and get overwritten by TEX, so
+        //  they are mapped to an initialized local.
+        else if (shader_is_pixel(ctx) && !shader_version_atleast(ctx, 1, 4))
+            output_line(ctx, "float4 %s = input.m_%s;",name,name);
+    } // else if
+
+    // REG_TYPE_LOOP: declared by the for loop itself at the moment.
+    // REG_TYPE_LABEL: seeing it here means it was optimized out.
+    else if ((regtype != REG_TYPE_LOOP) && (regtype != REG_TYPE_LABEL))
+        fail(ctx, "BUG: we used a register we don't know how to define.");
+
+    pop_output(ctx);
+} // emit_HLSL_global
+
+// Relatively-addressed uniform arrays are not declared on their own: every
+//  uniform (other than literal constant arrays embedded in the shader) lives
+//  in one packed array that is emitted during finalization, so transfers can
+//  be batched. All this does is emit a named constant holding the array's
+//  starting offset inside the packed float4 array, and remember that offset
+//  in var->emit_position so dereferences can use it.
+void emit_HLSL_array(Context *ctx, VariableList *var)
+{
+    const int packed_start = ctx->uniform_float4_count;
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent++;
+    output_line(ctx, "const int ARRAYBASE_%d = %d;", var->index, packed_start);
+    pop_output(ctx);
+
+    var->emit_position = packed_start;
+} // emit_HLSL_array
+
+// Emits a literal "const float4 <name>[size] = { ... };" table at the top of
+//  the main function, one float4(...) initializer per element.
+//
+//  clist: list of shader constants; non-float entries are skipped. The float
+//         entries are expected to appear in index order starting at `base`
+//         (checked with assert).
+//  base:  register index of the first element.
+//  size:  number of elements to emit.
+void emit_HLSL_const_array(Context *ctx, const ConstantsList *clist,
+                           int base, int size)
+{
+    char varname[64];
+    get_HLSL_const_array_varname_in_buf(ctx,base,size,varname,sizeof(varname));
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent++;
+    output_line(ctx, "const float4 %s[%d] = {", varname, size);
+    ctx->indent++;
+
+    int i;
+    for (i = 0; i < size; i++)
+    {
+        // Skip over int/bool constants interleaved in the list.
+        while (clist->constant.type != MOJOSHADER_UNIFORM_FLOAT)
+            clist = clist->next;
+        assert(clist->constant.index == (base + i));
+
+        // One text buffer per float4 component.
+        char val[4][32];
+        int c;
+        for (c = 0; c < 4; c++)
+            floatstr(ctx, val[c], sizeof (val[c]), clist->constant.value.f[c], 1);
+
+        // Every element but the last is followed by a comma.
+        output_line(ctx, "float4(%s, %s, %s, %s)%s",
+                    val[0], val[1], val[2], val[3],
+                    (i < (size-1)) ? "," : "");
+
+        clist = clist->next;
+    } // for
+
+    ctx->indent--;
+    output_line(ctx, "};");
+    pop_output(ctx);
+} // emit_HLSL_const_array
+
+// Maps one uniform register onto an element of a backing array by emitting
+//  "#define <reg> <array>[<index>]" at the top of the main function and a
+//  matching "#undef <reg>" at the current end of the mainline.
+//
+//  var == NULL:   the register is a standalone uniform; it is packed into
+//                 the next free slot of the big per-type uniform array, so
+//                 if only c439 is used it maps to element 0 and no unused
+//                 data has to be uploaded.
+//  var->constant: the register is part of a literal constant array emitted
+//                 by emit_HLSL_const_array().
+//  otherwise:     the register is part of a relatively-addressed uniform
+//                 array whose offset was assigned by emit_HLSL_array().
+void emit_HLSL_uniform(Context *ctx, RegisterType regtype, int regnum,
+                       const VariableList *var)
+{
+    char varname[64];
+    char name[64];
+    int index = 0;
+
+    get_HLSL_varname_in_buf(ctx, regtype, regnum, varname, sizeof (varname));
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent++;
+
+    if (var == NULL)
+    {
+        get_HLSL_uniform_array_varname(ctx, regtype, name, sizeof (name));
+
+        if (regtype == REG_TYPE_CONST)
+            index = ctx->uniform_float4_count;
+        else if (regtype == REG_TYPE_CONSTINT)
+            index = ctx->uniform_int4_count;
+        else if (regtype == REG_TYPE_CONSTBOOL)
+            index = ctx->uniform_bool_count;
+        else  // get_HLSL_uniform_array_varname() would have called fail().
+            assert(!(ctx->isfail));  // NOTE(review): condition looks inverted relative to the comment - confirm.
+    } // if
+
+    else if (var->constant)
+    {
+        const int arraybase = var->index;
+        get_HLSL_const_array_varname_in_buf(ctx, arraybase, var->count,
+                                            name, sizeof (name));
+        index = (regnum - arraybase);
+    } // else if
+
+    else
+    {
+        const int arraybase = var->index;
+        assert(var->emit_position != -1);
+        get_HLSL_uniform_array_varname(ctx, regtype, name, sizeof (name));
+        index = (regnum - arraybase) + var->emit_position;
+    } // else
+
+    // The macro body must NOT end in a semicolon: it is substituted into the
+    //  middle of expressions, where a stray ';' would terminate the
+    //  statement early and break the generated shader.
+    output_line(ctx, "#define %s %s[%d]", varname, name, index);
+    push_output(ctx, &ctx->mainline);
+    ctx->indent++;
+    output_line(ctx, "#undef %s", varname);  // !!! FIXME: gross.
+    pop_output(ctx);
+
+    pop_output(ctx);
+} // emit_HLSL_uniform
+
+// Declares the texture object and sampler state for one sampler stage in the
+//  globals section. When `tb` is set (the sampler was used by a ps_1_1
+//  TEXBEM opcode), two extra slots are reserved in the packed float4 uniform
+//  array and exposed as <sampler>_texbem / <sampler>_texbeml constants.
+void emit_HLSL_sampler(Context *ctx,int stage,TextureType ttype,int tb)
+{
+    char samp[64];
+    const char *dim = NULL;
+
+    if (ttype == TEXTURE_TYPE_2D)
+        dim = "2D";
+    else if (ttype == TEXTURE_TYPE_CUBE)
+        dim = "Cube";
+    else if (ttype == TEXTURE_TYPE_VOLUME)
+        dim = "3D";
+    else
+    {
+        assert(!"unexpected texture type");
+        return;
+    } // else
+
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, stage, samp, sizeof(samp));
+
+    push_output(ctx, &ctx->globals);
+    output_line(ctx, "Texture%s %s_texture : register(t%d);", dim, samp, stage);
+    output_line(ctx, "SamplerState %s : register(%s);", samp, samp);
+    pop_output(ctx);
+
+    if (!tb)
+        return;  // no TEXBEM data needed for this sampler.
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent++;
+
+    // Claim the next two float4 uniform slots for the bump-env data.
+    char uniforms[64];
+    const int slot = ctx->uniform_float4_count;
+    ctx->uniform_float4_count += 2;
+    get_HLSL_uniform_array_varname(ctx, REG_TYPE_CONST, uniforms, sizeof(uniforms));
+    output_line(ctx, "const float4 %s_texbem = %s[%d];", samp, uniforms, slot);
+    output_line(ctx, "const float4 %s_texbeml = %s[%d];", samp, uniforms, slot + 1);
+
+    pop_output(ctx);
+} // emit_HLSL_sampler
+
+
+// Declares one shader input or output attribute.
+//
+// For each attribute this appends a member named "m_<reg>" (with the
+//  matching HLSL semantic) to the "<mainfn>_Input" or "<mainfn>_Output"
+//  struct, opening the struct first if its buffer is still empty (the
+//  closing "};" is written by emit_HLSL_finalize()). It then emits
+//  "#define <reg> input.m_<reg>" / "#define <reg> output.m_<reg>" at the top
+//  of the main function and a matching "#undef <reg>" in the mainline, so
+//  the rest of the emitter can refer to the register by its plain name.
+//
+//  regtype/regnum: the register being declared.
+//  usage/index:    semantic usage and usage index of the register.
+//  wmask:          write mask (currently ignored, see FIXME below).
+//  flags:          modifier flags; MOD_CENTROID is rejected in pixel shaders.
+//
+// Problems are reported through fail()/failf().
+void emit_HLSL_attribute(Context *ctx, RegisterType regtype, int regnum,
+                         MOJOSHADER_usage usage, int index, int wmask,
+                         int flags)
+{
+    // !!! FIXME: this function doesn't deal with write masks at all yet!
+    // NOTE(review): usage_str is never used, and index_str is filled in
+    //  below but never read anywhere in this function.
+    const char *usage_str = NULL;
+    char index_str[16] = { '\0' };
+    char var[64];
+    char a[256];
+
+    get_HLSL_varname_in_buf(ctx, regtype, regnum, var, sizeof (var));
+
+    //assert((flags & MOD_PP) == 0);  // !!! FIXME: is PP allowed?
+
+    if (index != 0)  // !!! FIXME: a lot of these MUST be zero.
+        snprintf(index_str, sizeof (index_str), "%u", (uint) index);
+
+    if (shader_is_vertex(ctx))
+    {
+        // pre-vs3 output registers.
+        // these don't ever happen in DCL opcodes, I think. Map to vs_3_*
+        //  output registers.
+        if (!shader_version_atleast(ctx, 3, 0))
+        {
+            if (regtype == REG_TYPE_RASTOUT)
+            {
+                regtype = REG_TYPE_OUTPUT;
+                index = regnum;
+                switch ((const RastOutType) regnum)
+                {
+                    case RASTOUT_TYPE_POSITION:
+                        usage = MOJOSHADER_USAGE_POSITION;
+                        break;
+                    case RASTOUT_TYPE_FOG:
+                        usage = MOJOSHADER_USAGE_FOG;
+                        break;
+                    case RASTOUT_TYPE_POINT_SIZE:
+                        usage = MOJOSHADER_USAGE_POINTSIZE;
+                        break;
+                } // switch
+            } // if
+
+            else if (regtype == REG_TYPE_ATTROUT)
+            {
+                regtype = REG_TYPE_OUTPUT;
+                usage = MOJOSHADER_USAGE_COLOR;
+                index = regnum;
+            } // else if
+
+            else if (regtype == REG_TYPE_TEXCRDOUT)
+            {
+                regtype = REG_TYPE_OUTPUT;
+                usage = MOJOSHADER_USAGE_TEXCOORD;
+                index = regnum;
+            } // else if
+        } // if
+
+        if (regtype == REG_TYPE_INPUT)
+        {
+            push_output(ctx, &ctx->inputs);
+            // First input seen: open the input struct.
+            if (buffer_size(ctx->inputs) == 0)
+            {
+                output_line(ctx, "struct %s_Input", ctx->mainfn);
+                output_line(ctx, "{");
+            } // if
+
+            ctx->indent++;
+            switch (usage)
+            {
+                case MOJOSHADER_USAGE_BINORMAL:
+                    output_line(ctx, "float4 m_%s : BINORMAL%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_BLENDINDICES:
+                    output_line(ctx, "float4 m_%s : BLENDINDICES%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_BLENDWEIGHT:
+                    output_line(ctx, "float4 m_%s : BLENDWEIGHT%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_COLOR:
+                    output_line(ctx, "float4 m_%s : COLOR%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_NORMAL:
+                    output_line(ctx, "float4 m_%s : NORMAL%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_POSITION:
+                    output_line(ctx, "float4 m_%s : POSITION%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_POSITIONT:
+                    output_line(ctx, "float4 m_%s : POSITIONT;", var);
+                    break;
+                case MOJOSHADER_USAGE_POINTSIZE:
+                    output_line(ctx, "float4 m_%s : PSIZE;", var);
+                    break;
+                case MOJOSHADER_USAGE_TANGENT:
+                    output_line(ctx, "float4 m_%s : TANGENT%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_TEXCOORD:
+                    output_line(ctx, "float4 m_%s : TEXCOORD%d;", var, index);
+                    break;
+                default:
+                    fail(ctx, "Unknown vertex input semantic type!");
+                    break;
+            } // switch
+            pop_output(ctx);
+
+            // Alias the register name to the struct member for the body of
+            //  the main function, and drop the alias afterwards.
+            push_output(ctx, &ctx->mainline_top);
+            ctx->indent++;
+            output_line(ctx, "#define %s input.m_%s", var, var);
+            pop_output(ctx);
+            push_output(ctx, &ctx->mainline);
+            ctx->indent++;
+            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
+            pop_output(ctx);
+        } // if
+
+        else if (regtype == REG_TYPE_OUTPUT)
+        {
+            push_output(ctx, &ctx->outputs);
+            // First output seen: open the output struct.
+            if (buffer_size(ctx->outputs) == 0)
+            {
+                output_line(ctx, "struct %s_Output", ctx->mainfn);
+                output_line(ctx, "{");
+            } // if
+
+            ctx->indent++;
+
+            switch (usage)
+            {
+                case MOJOSHADER_USAGE_BINORMAL:
+                    output_line(ctx, "float4 m_%s : BINORMAL%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_BLENDINDICES:
+                    output_line(ctx, "float4 m_%s : BLENDINDICES%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_BLENDWEIGHT:
+                    output_line(ctx, "float4 m_%s : BLENDWEIGHT%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_COLOR:
+                    output_line(ctx, "float4 m_%s : COLOR%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_FOG:
+                    output_line(ctx, "float m_%s : FOG;", var);
+                    break;
+                case MOJOSHADER_USAGE_NORMAL:
+                    output_line(ctx, "float4 m_%s : NORMAL%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_POSITION:
+                    // POSITION0 is not written here: its name is saved so
+                    //  emit_HLSL_finalize() can add it as the SV_Position
+                    //  member when it closes the struct.
+                    if (index == 0)
+                        snprintf(ctx->hlsl_outpos_name,
+                                 sizeof(ctx->hlsl_outpos_name), "%s", var);
+                    else
+                        output_line(ctx, "float4 m_%s : POSITION%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_POSITIONT:
+                    output_line(ctx, "float4 m_%s : POSITIONT;", var);
+                    break;
+                case MOJOSHADER_USAGE_POINTSIZE:
+                    output_line(ctx, "float m_%s : PSIZE;", var);
+                    break;
+                case MOJOSHADER_USAGE_TANGENT:
+                    output_line(ctx, "float4 m_%s : TANGENT%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_TESSFACTOR:
+                    output_line(ctx, "float m_%s : TESSFACTOR%d;", var, index);
+                    break;
+                case MOJOSHADER_USAGE_TEXCOORD:
+                    output_line(ctx, "float4 m_%s : TEXCOORD%d;", var, index);
+                    break;
+                default:
+                    snprintf(a, sizeof(a), "Invalid vertex output semantic %d", usage);
+                    fail(ctx, a);
+                    break;
+            } // switch
+
+            pop_output(ctx);
+
+            push_output(ctx, &ctx->mainline_top);
+            ctx->indent++;
+            output_line(ctx, "#define %s output.m_%s", var, var);
+            pop_output(ctx);
+            push_output(ctx, &ctx->mainline);
+            ctx->indent++;
+            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
+            pop_output(ctx);
+        } // else if
+
+        else
+        {
+            fail(ctx, "unknown vertex shader attribute register");
+        } // else
+    } // if
+
+    else if (shader_is_pixel(ctx))
+    {
+        // samplers DCLs get handled in emit_HLSL_sampler().
+
+        if (flags & MOD_CENTROID)  // !!! FIXME
+        {
+            failf(ctx, "centroid unsupported in %s profile", ctx->profile->name);
+            return;
+        } // if
+
+        if ((regtype == REG_TYPE_COLOROUT) || (regtype == REG_TYPE_DEPTHOUT))
+        {
+            push_output(ctx, &ctx->outputs);
+            // First output seen: open the output struct.
+            if (buffer_size(ctx->outputs) == 0)
+            {
+                output_line(ctx, "struct %s_Output", ctx->mainfn);
+                output_line(ctx, "{");
+            } // if
+            ctx->indent++;
+
+            if (regtype == REG_TYPE_COLOROUT)
+                output_line(ctx, "float4 m_%s : SV_Target%d;", var, regnum);
+            else if (regtype == REG_TYPE_DEPTHOUT)
+                output_line(ctx, "float m_%s : SV_Depth;", var);
+
+            pop_output(ctx);
+
+            push_output(ctx, &ctx->mainline_top);
+            ctx->indent++;
+            output_line(ctx, "#define %s output.m_%s", var, var);
+            pop_output(ctx);
+            push_output(ctx, &ctx->mainline);
+            ctx->indent++;
+            output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
+            pop_output(ctx);
+        } // if
+
+        // !!! FIXME: can you actually have a texture register with COLOR usage?
+        else if ((regtype == REG_TYPE_TEXTURE) ||
+                 (regtype == REG_TYPE_INPUT) ||
+                 (regtype == REG_TYPE_MISCTYPE))
+        {
+            // skipreference: set when no #define/#undef alias should be
+            //  emitted for this register.
+            // define_start/define_end: text wrapped around "input.m_<reg>"
+            //  in the #define, used to convert the value on access.
+            int skipreference = 0;
+            const char *define_start = "";
+            const char *define_end = "";
+
+            push_output(ctx, &ctx->inputs);
+            // First input seen: open the input struct.
+            if (buffer_size(ctx->inputs) == 0)
+            {
+                output_line(ctx, "struct %s_Input", ctx->mainfn);
+                output_line(ctx, "{");
+                output_line(ctx, "\t// This must match the vertex output!");
+                output_line(ctx, "\t// Rewrite at link time if needed!");
+            } // if
+            ctx->indent++;
+
+            if (regtype == REG_TYPE_MISCTYPE)
+            {
+                const MiscTypeType mt = (MiscTypeType) regnum;
+                if (mt == MISCTYPE_TYPE_FACE)
+                {
+                    // In SM 3.0, VFACE was a float whose sign determined
+                    //  face direction. In SM 4.0+, it's just a bool, so
+                    //  we convert the value when we output the #define.
+                    output_line(ctx, "bool m_%s : SV_IsFrontFace;", var);
+                    define_start = "(";
+                    define_end = " ? 1 : -1)";
+                } // if
+                else if (mt == MISCTYPE_TYPE_POSITION)
+                    output_line(ctx, "float4 m_%s : SV_Position;", var);
+                else
+                    fail(ctx, "BUG: unhandled misc register");
+            } // if
+
+            else
+            {
+                // Usages other than the four handled below produce no
+                //  struct member here.
+                if (usage == MOJOSHADER_USAGE_TEXCOORD)
+                {
+                    // ps_1_1 does a different hack for this attribute.
+                    //  Refer to emit_HLSL_global()'s REG_TYPE_ADDRESS code.
+                    if (!shader_version_atleast(ctx, 1, 4))
+                        skipreference = 1;
+                    output_line(ctx, "float4 m_%s : TEXCOORD%d;", var, index);
+                } // if
+
+                else if (usage == MOJOSHADER_USAGE_COLOR)
+                    output_line(ctx, "float4 m_%s : COLOR%d;", var, index);
+
+                else if (usage == MOJOSHADER_USAGE_FOG)
+                    output_line(ctx, "float m_%s : FOG;", var);
+
+                else if (usage == MOJOSHADER_USAGE_NORMAL)
+                    output_line(ctx, "float4 m_%s : NORMAL;", var);
+            } // else
+
+            pop_output(ctx);
+
+            if (!skipreference)
+            {
+                push_output(ctx, &ctx->mainline_top);
+                ctx->indent++;
+                output_line(ctx, "#define %s %sinput.m_%s%s", var,
+                            define_start, var, define_end);
+                pop_output(ctx);
+                push_output(ctx, &ctx->mainline);
+                ctx->indent++;
+                output_line(ctx, "#undef %s", var);  // !!! FIXME: gross.
+                pop_output(ctx);
+            } // if
+        } // else if
+
+        else
+        {
+            fail(ctx, "unknown pixel shader attribute register");
+        } // else
+    } // else if
+
+    else
+    {
+        fail(ctx, "Unknown shader type");  // state machine should catch this.
+    } // else
+} // emit_HLSL_attribute
+
+// NOP: emits nothing.
+void emit_HLSL_NOP(Context *ctx)
+{
+    (void) ctx;  // deliberately unused; a no-op generates no code.
+} // emit_HLSL_NOP
+
+// MOV: assigns the masked source operand to the destination.
+void emit_HLSL_MOV(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MOV
+
+// ADD: dest = src0 + src1.
+void emit_HLSL_ADD(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s + %s", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_ADD
+
+// SUB: dest = src0 - src1.
+void emit_HLSL_SUB(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s - %s", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SUB
+
+// MAD: dest = (src0 * src1) + src2.
+void emit_HLSL_MAD(Context *ctx)
+{
+    char factor_a[64];
+    char factor_b[64];
+    char addend[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, factor_a, sizeof (factor_a));
+    make_HLSL_srcarg_string_masked(ctx, 1, factor_b, sizeof (factor_b));
+    make_HLSL_srcarg_string_masked(ctx, 2, addend, sizeof (addend));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "(%s * %s) + %s",
+                             factor_a, factor_b, addend);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MAD
+
+// MUL: dest = src0 * src1.
+void emit_HLSL_MUL(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s * %s", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MUL
+
+// RCP: reciprocal of the source, yielding FLT_MAX when the source is zero.
+//  Sets need_max_float so the finalizer defines FLT_MAX.
+void emit_HLSL_RCP(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    ctx->need_max_float = 1;
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "(%s == 0.0) ? FLT_MAX : 1.0 / %s",
+                             operand, operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_RCP
+
+// RSQ: reciprocal square root of the source's absolute value, yielding
+//  FLT_MAX when the source is zero. Sets need_max_float so the finalizer
+//  defines FLT_MAX.
+void emit_HLSL_RSQ(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    ctx->need_max_float = 1;
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "(%s == 0.0) ? FLT_MAX : rsqrt(abs(%s))",
+                             operand, operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_RSQ
+
+// Shared helper for the dot-product opcodes: assigns
+//  "dot(src0, src1)<extra>" to the destination. `extra` is appended
+//  verbatim after the dot() call.
+void emit_HLSL_dotprod(Context *ctx, const char *src0, const char *src1,
+                       const char *extra)
+{
+    char stmt[128];
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "dot(%s, %s)%s",
+                             src0, src1, extra);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_dotprod
+
+// DP3: dot product of the two sources, each formatted by the vec3 helper.
+void emit_HLSL_DP3(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+
+    make_HLSL_srcarg_string_vec3(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_vec3(ctx, 1, rhs, sizeof (rhs));
+    emit_HLSL_dotprod(ctx, lhs, rhs, "");
+} // emit_HLSL_DP3
+
+// DP4: dot product of the two sources, each formatted by the full helper.
+void emit_HLSL_DP4(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+
+    make_HLSL_srcarg_string_full(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_full(ctx, 1, rhs, sizeof (rhs));
+    emit_HLSL_dotprod(ctx, lhs, rhs, "");
+} // emit_HLSL_DP4
+
+// MIN: dest = min(src0, src1).
+void emit_HLSL_MIN(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "min(%s, %s)", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MIN
+
+// MAX: dest = max(src0, src1).
+void emit_HLSL_MAX(Context *ctx)
+{
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "max(%s, %s)", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MAX
+
+// SLT: dest = (src0 < src1). A single-component result is wrapped in
+//  float(), since float(bool) gives the 0.0 or 1.0 that SLT wants; wider
+//  results emit the bare comparison.
+void emit_HLSL_SLT(Context *ctx)
+{
+    const int components = vecsize_from_writemask(ctx->dest_arg.writemask);
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+
+    if (components != 1)
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s < %s", lhs, rhs);
+    else
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "float(%s < %s)", lhs, rhs);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SLT
+
+// SGE: dest = (src0 >= src1). A single-component result is wrapped in
+//  float(), since float(bool) gives the 0.0 or 1.0 that SGE wants; wider
+//  results emit the bare comparison.
+void emit_HLSL_SGE(Context *ctx)
+{
+    const int components = vecsize_from_writemask(ctx->dest_arg.writemask);
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+
+    if (components != 1)
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "%s >= %s", lhs, rhs);
+    else
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "float(%s >= %s)", lhs, rhs);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SGE
+
+// EXP: dest = exp2(src0).
+void emit_HLSL_EXP(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "exp2(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_EXP
+
+// LOG: dest = log2(src0).
+void emit_HLSL_LOG(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "log2(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_LOG
+
+// LIT: emits a call to the lit() intrinsic on the source's x, y and w
+//  components, with w clamped to +/- the power limit.
+void emit_HLSL_LIT(Context *ctx)
+{
+    const char *power_limit = "127.9961"; // value from the dx9 reference.
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_full(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "lit(%s.x, %s.y, clamp(%s.w, -%s, %s))",
+                             operand, operand, operand,
+                             power_limit, power_limit);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_LIT
+
+// DST: dest = float4(1.0, src0.y * src1.y, src0.z, src1.w).
+void emit_HLSL_DST(Context *ctx)
+{
+    // !!! FIXME: needs to take ctx->dst_arg.writemask into account
+    // !!! FIXME: can we use dst() intrinsic instead? -caleb
+    char a_y[64];
+    char b_y[64];
+    char a_z[64];
+    char b_w[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_y(ctx, 0, a_y, sizeof (a_y));
+    make_HLSL_srcarg_string_y(ctx, 1, b_y, sizeof (b_y));
+    make_HLSL_srcarg_string_z(ctx, 0, a_z, sizeof (a_z));
+    make_HLSL_srcarg_string_w(ctx, 1, b_w, sizeof (b_w));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "float4(1.0, %s * %s, %s, %s)",
+                             a_y, b_y, a_z, b_w);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_DST
+
+// LRP: dest = lerp(src2, src1, src0). Note the operands are passed to
+//  lerp() in reverse order, with src0 as the blend weight.
+void emit_HLSL_LRP(Context *ctx)
+{
+    char weight[64];
+    char to[64];
+    char from[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, weight, sizeof (weight));
+    make_HLSL_srcarg_string_masked(ctx, 1, to, sizeof (to));
+    make_HLSL_srcarg_string_masked(ctx, 2, from, sizeof (from));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "lerp(%s, %s, %s)",
+                             from, to, weight);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_LRP
+
+// FRC: dest = frac(src0).
+void emit_HLSL_FRC(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "frac(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_FRC
+
+// M4X4: builds a float4 from the dot products of source 0 with each of the
+//  four row operands (sources 1-4), all formatted by the full helper.
+void emit_HLSL_M4X4(Context *ctx)
+{
+    char vec[64];
+    char rows[4][64];
+    char stmt[256];
+    int r;
+
+    make_HLSL_srcarg_string_full(ctx, 0, vec, sizeof (vec));
+    for (r = 0; r < 4; r++)
+        make_HLSL_srcarg_string_full(ctx, r + 1, rows[r], sizeof (rows[r]));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                    "float4(dot(%s, %s), dot(%s, %s), dot(%s, %s), dot(%s, %s))",
+                    vec, rows[0], vec, rows[1], vec, rows[2], vec, rows[3]);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_M4X4
+
+// M4X3: builds a float3 from the dot products of source 0 with each of the
+//  three row operands (sources 1-3), all formatted by the full helper.
+void emit_HLSL_M4X3(Context *ctx)
+{
+    char vec[64];
+    char rows[3][64];
+    char stmt[256];
+    int r;
+
+    make_HLSL_srcarg_string_full(ctx, 0, vec, sizeof (vec));
+    for (r = 0; r < 3; r++)
+        make_HLSL_srcarg_string_full(ctx, r + 1, rows[r], sizeof (rows[r]));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                "float3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
+                                vec, rows[0], vec, rows[1], vec, rows[2]);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_M4X3
+
+// M3X4: builds a float4 from the dot products of source 0 with each of the
+//  four row operands (sources 1-4), all formatted by the vec3 helper.
+void emit_HLSL_M3X4(Context *ctx)
+{
+    char vec[64];
+    char rows[4][64];
+    char stmt[256];
+    int r;
+
+    make_HLSL_srcarg_string_vec3(ctx, 0, vec, sizeof (vec));
+    for (r = 0; r < 4; r++)
+        make_HLSL_srcarg_string_vec3(ctx, r + 1, rows[r], sizeof (rows[r]));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                "float4(dot(%s, %s), dot(%s, %s), "
+                                     "dot(%s, %s), dot(%s, %s))",
+                                vec, rows[0], vec, rows[1],
+                                vec, rows[2], vec, rows[3]);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_M3X4
+
+// M3X3: builds a float3 from the dot products of source 0 with each of the
+//  three row operands (sources 1-3), all formatted by the vec3 helper.
+void emit_HLSL_M3X3(Context *ctx)
+{
+    char vec[64];
+    char rows[3][64];
+    char stmt[256];
+    int r;
+
+    make_HLSL_srcarg_string_vec3(ctx, 0, vec, sizeof (vec));
+    for (r = 0; r < 3; r++)
+        make_HLSL_srcarg_string_vec3(ctx, r + 1, rows[r], sizeof (rows[r]));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                "float3(dot(%s, %s), dot(%s, %s), dot(%s, %s))",
+                                vec, rows[0], vec, rows[1], vec, rows[2]);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_M3X3
+
+// M3X2: builds a float2 from the dot products of source 0 with each of the
+//  two row operands (sources 1-2), all formatted by the vec3 helper.
+void emit_HLSL_M3X2(Context *ctx)
+{
+    char vec[64];
+    char rows[2][64];
+    char stmt[256];
+    int r;
+
+    make_HLSL_srcarg_string_vec3(ctx, 0, vec, sizeof (vec));
+    for (r = 0; r < 2; r++)
+        make_HLSL_srcarg_string_vec3(ctx, r + 1, rows[r], sizeof (rows[r]));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                "float2(dot(%s, %s), dot(%s, %s))",
+                                vec, rows[0], vec, rows[1]);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_M3X2
+
+// Emits a call to the subroutine named by source argument 0. While
+//  ctx->loops is greater than zero, "aL" is passed as the only argument.
+void emit_HLSL_CALL(Context *ctx)
+{
+    char label[64];
+    make_HLSL_srcarg_string_masked(ctx, 0, label, sizeof (label));
+
+    if (ctx->loops <= 0)
+    {
+        output_line(ctx, "%s();", label);
+    } // if
+    else
+    {
+        output_line(ctx, "%s(aL);", label);
+    } // else
+} // emit_HLSL_CALL
+
+// Emits a conditional subroutine call: source argument 1 is the condition
+//  and source argument 0 names the subroutine. While ctx->loops is greater
+//  than zero, "aL" is passed as the only argument.
+void emit_HLSL_CALLNZ(Context *ctx)
+{
+    // !!! FIXME: if src1 is a constbool that's true, we can remove the
+    // !!! FIXME:  if. If it's false, we can make this a no-op.
+    char label[64];
+    char cond[64];
+    make_HLSL_srcarg_string_masked(ctx, 0, label, sizeof (label));
+    make_HLSL_srcarg_string_masked(ctx, 1, cond, sizeof (cond));
+
+    if (ctx->loops <= 0)
+    {
+        output_line(ctx, "if (%s) { %s(); }", cond, label);
+    } // if
+    else
+    {
+        output_line(ctx, "if (%s) { %s(aL); }", cond, label);
+    } // else
+} // emit_HLSL_CALLNZ
+
+// Opens the HLSL for a LOOP opcode: a scope block holding "aLend", followed
+//  by a "for" loop over "aL". The .x, .y and .z components of source
+//  argument 1 are emitted as the count, the start value and the step.
+//  Leaves ctx->indent two levels deeper; emit_HLSL_ENDLOOP closes both.
+void emit_HLSL_LOOP(Context *ctx)
+{
+    // !!! FIXME: swizzle?
+    char counts[64];
+    get_HLSL_srcarg_varname(ctx, 1, counts, sizeof (counts));
+
+    // in case they add aL1 someday.
+    assert(ctx->source_args[0].regnum == 0);
+
+    output_line(ctx, "{");
+    ctx->indent += 1;
+    output_line(ctx, "const int aLend = %s.x + %s.y;", counts, counts);
+    output_line(ctx, "for (int aL = %s.y; aL < aLend; aL += %s.z) {", counts, counts);
+    ctx->indent += 1;
+} // emit_HLSL_LOOP
+
+// Emits the closing brace for the function that a RET opcode terminates.
+//  The brace and a following blank line are written after push_output()
+//  selects ctx->postflight; output is then switched to ctx->subroutines.
+// NOTE(review): there is no pop_output() paired with the push_output() in
+//  this function, and ctx->indent is not decremented here -- confirm both
+//  are handled elsewhere.
+void emit_HLSL_RET(Context *ctx)
+{
+    // thankfully, the MSDN specs say a RET _has_ to end a function...no
+    //  early returns. So if you hit one, you know you can safely close
+    //  a high-level function.
+    push_output(ctx, &ctx->postflight);
+    output_line(ctx, "}");
+    output_blank_line(ctx);
+    set_output(ctx, &ctx->subroutines);  // !!! FIXME: is this for LABEL? Maybe set it there so we don't allocate unnecessarily.
+} // emit_HLSL_RET
+
+// Closes the two scopes opened by emit_HLSL_LOOP: first the "for" body,
+//  then the enclosing block, dropping one indent level before each brace.
+void emit_HLSL_ENDLOOP(Context *ctx)
+{
+    int closed;
+    for (closed = 0; closed < 2; closed++)
+    {
+        ctx->indent -= 1;
+        output_line(ctx, "}");
+    } // for
+} // emit_HLSL_ENDLOOP
+
+// Opens an HLSL function for a LABEL opcode. The function name comes from
+//  source argument 0. If the label is not found in ctx->used_registers,
+//  output is redirected to ctx->ignore so the function body is discarded.
+//  The function takes an "int aL" parameter when the label's register
+//  entry has misc == 1.
+void emit_HLSL_LABEL(Context *ctx)
+{
+    char src0[64]; make_HLSL_srcarg_string_masked(ctx, 0, src0, sizeof (src0));
+    const int label = ctx->source_args[0].regnum;
+    RegisterList *reg = reglist_find(&ctx->used_registers, REG_TYPE_LABEL, label);
+    assert(ctx->output == ctx->subroutines);  // not mainline, etc.
+    assert(ctx->indent == 0);  // we shouldn't be in the middle of a function.
+
+    // MSDN specs say CALL* has to come before the LABEL, so we know if we
+    //  can ditch the entire function here as unused.
+    if (reg == NULL)
+        set_output(ctx, &ctx->ignore);  // Func not used. Parse, but don't output.
+
+    // !!! FIXME: it would be nice if we could determine if a function is
+    // !!! FIXME:  only called once and, if so, forcibly inline it.
+
+    // reg may be NULL here (unused function), hence the guard before ->misc.
+    const char *uses_loopreg = ((reg) && (reg->misc == 1)) ? "int aL" : "";
+    output_line(ctx, "void %s(%s)", src0, uses_loopreg);
+    output_line(ctx, "{");
+    ctx->indent++;
+} // emit_HLSL_LABEL
+
+// DCL emits nothing at this point.
+void emit_HLSL_DCL(Context *ctx)
+{
+    // no-op. We do this in our emit_attribute() and emit_uniform().
+    (void) ctx;
+} // emit_HLSL_DCL
+
+// Emits "pow(abs(src0), src1)" assigned to the destination; the base is
+//  wrapped in abs() before being raised to the exponent.
+void emit_HLSL_POW(Context *ctx)
+{
+    char base[64];
+    char exponent[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, base, sizeof (base));
+    make_HLSL_srcarg_string_masked(ctx, 1, exponent, sizeof (exponent));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "pow(abs(%s), %s)", base, exponent);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_POW
+
+// Emits "cross(src0, src1)" assigned to the destination, with both
+//  operands formatted as vec3.
+void emit_HLSL_CRS(Context *ctx)
+{
+    // !!! FIXME: needs to take ctx->dst_arg.writemask into account.
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_vec3(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_vec3(ctx, 1, rhs, sizeof (rhs));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "cross(%s, %s)", lhs, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_CRS
+
+// Emits "sign(src0)" assigned to the destination. Only source argument 0
+//  is read.
+void emit_HLSL_SGN(Context *ctx)
+{
+    // (we don't need the temporary registers specified for the D3D opcode.)
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "sign(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SGN
+
+// Emits "abs(src0)" assigned to the destination.
+void emit_HLSL_ABS(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "abs(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_ABS
+
+// Emits "normalize(src0)" assigned to the destination.
+void emit_HLSL_NRM(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "normalize(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_NRM
+
+// Emits the HLSL for SINCOS. Depending on which writemask helper matches
+//  the destination mask, the statement assigns cos(src0), sin(src0), or
+//  float2(cos(src0), sin(src0)). If none of the three helpers match, the
+//  zero-initialized buffer is emitted as-is.
+void emit_HLSL_SINCOS(Context *ctx)
+{
+    // we don't care about the temp registers that <= sm2 demands; ignore them.
+    //  sm2 also talks about what components are left untouched vs. undefined,
+    //  but we just leave those all untouched with HLSL write masks (which
+    //  would fulfill the "undefined" requirement, too).
+    const int wmask = ctx->dest_arg.writemask;
+    char angle[64];
+    char stmt[128] = { '\0' };
+
+    make_HLSL_srcarg_string_scalar(ctx, 0, angle, sizeof (angle));
+
+    if (writemask_x(wmask))
+    {
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "cos(%s)", angle);
+    } // if
+    else if (writemask_y(wmask))
+    {
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "sin(%s)", angle);
+    } // else if
+    else if (writemask_xy(wmask))
+    {
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                 "float2(cos(%s), sin(%s))", angle, angle);
+    } // else if
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SINCOS
+
+// Opens a counted "for" loop for REP. The loop variable is named
+//  "rep<N>", with N taken from ctx->reps, and the bound is the .x form of
+//  source argument 0. Leaves ctx->indent one level deeper.
+void emit_HLSL_REP(Context *ctx)
+{
+    // !!! FIXME:
+    // msdn docs say legal loop values are 0 to 255. We can check DEFI values
+    //  at parse time, but if they are pulling a value from a uniform, do
+    //  we clamp here?
+    // !!! FIXME: swizzle is legal here, right?
+    char bound[64];
+    make_HLSL_srcarg_string_x(ctx, 0, bound, sizeof (bound));
+
+    const uint id = (uint) ctx->reps;
+    output_line(ctx, "for (int rep%u = 0; rep%u < %s; rep%u++) {",
+                id, id, bound, id);
+    ctx->indent += 1;
+} // emit_HLSL_REP
+
+// Closes the "for" loop opened by emit_HLSL_REP.
+void emit_HLSL_ENDREP(Context *ctx)
+{
+    ctx->indent -= 1;
+    output_line(ctx, "}");
+} // emit_HLSL_ENDREP
+
+// Opens an "if" block whose condition is the scalar form of source
+//  argument 0. Leaves ctx->indent one level deeper.
+void emit_HLSL_IF(Context *ctx)
+{
+    char cond[64];
+    make_HLSL_srcarg_string_scalar(ctx, 0, cond, sizeof (cond));
+    output_line(ctx, "if (%s) {", cond);
+    ctx->indent += 1;
+} // emit_HLSL_IF
+
+// Opens an "if" block comparing the scalar forms of source arguments 0
+//  and 1 with the operator from get_HLSL_comparison_string_scalar().
+//  Leaves ctx->indent one level deeper.
+void emit_HLSL_IFC(Context *ctx)
+{
+    const char *op = get_HLSL_comparison_string_scalar(ctx);
+    char lhs[64];
+    char rhs[64];
+
+    make_HLSL_srcarg_string_scalar(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_scalar(ctx, 1, rhs, sizeof (rhs));
+    output_line(ctx, "if (%s %s %s) {", lhs, op, rhs);
+    ctx->indent += 1;
+} // emit_HLSL_IFC
+
+// Emits "} else {" one indent level out from the surrounding body, then
+//  restores the indent for the else branch.
+void emit_HLSL_ELSE(Context *ctx)
+{
+    ctx->indent -= 1;
+    output_line(ctx, "} else {");
+    ctx->indent += 1;
+} // emit_HLSL_ELSE
+
+// Closes an "if"/"else" block: drops one indent level and emits "}".
+void emit_HLSL_ENDIF(Context *ctx)
+{
+    ctx->indent -= 1;
+    output_line(ctx, "}");
+} // emit_HLSL_ENDIF
+
+// Emits an unconditional "break;" statement.
+void emit_HLSL_BREAK(Context *ctx)
+{
+    output_line(ctx,
+                "break;");
+} // emit_HLSL_BREAK
+
+// Emits a conditional break: the scalar forms of source arguments 0 and 1
+//  are compared with the operator from get_HLSL_comparison_string_scalar().
+void emit_HLSL_BREAKC(Context *ctx)
+{
+    const char *op = get_HLSL_comparison_string_scalar(ctx);
+    char lhs[64];
+    char rhs[64];
+
+    make_HLSL_srcarg_string_scalar(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_scalar(ctx, 1, rhs, sizeof (rhs));
+    output_line(ctx, "if (%s %s %s) { break; }", lhs, op, rhs);
+} // emit_HLSL_BREAKC
+
+// Emits the HLSL for MOVA: floor(abs(x) + 0.5) * sign(x) converted to an
+//  integer type. A single-component writemask uses "int(...)"; wider masks
+//  use "int<N>(...)" where N is the component count from the writemask.
+void emit_HLSL_MOVA(Context *ctx)
+{
+    const int width = vecsize_from_writemask(ctx->dest_arg.writemask);
+    char value[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, value, sizeof (value));
+
+    if (width != 1)
+    {
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                            "int%d(floor(abs(%s) + 0.5) * sign(%s))",
+                            width, value, value);
+    } // if
+    else
+    {
+        make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                                 "int(floor(abs(%s) + 0.5) * sign(%s))",
+                                 value, value);
+    } // else
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_MOVA
+
+// Emits a "const bool" declaration for the destination register into
+//  ctx->mainline_top, one indent level in. The value is "true" when
+//  ctx->dwords[0] is nonzero, otherwise "false".
+void emit_HLSL_DEFB(Context *ctx)
+{
+    char name[64];
+    get_HLSL_destarg_varname(ctx, name, sizeof (name));
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent += 1;
+    const char *literal = ctx->dwords[0] ? "true" : "false";
+    output_line(ctx, "const bool %s = %s;", name, literal);
+    ctx->indent -= 1;
+    pop_output(ctx);
+} // emit_HLSL_DEFB
+
+// Emits a "const int4" declaration for the destination register into
+//  ctx->mainline_top, one indent level in. The four components come from
+//  ctx->dwords reinterpreted as int32 values.
+void emit_HLSL_DEFI(Context *ctx)
+{
+    char name[64];
+    get_HLSL_destarg_varname(ctx, name, sizeof (name));
+    const int32 *ivals = (const int32 *) ctx->dwords;
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent += 1;
+    output_line(ctx, "const int4 %s = int4(%d, %d, %d, %d);",
+                name,
+                (int) ivals[0], (int) ivals[1],
+                (int) ivals[2], (int) ivals[3]);
+    ctx->indent -= 1;
+    pop_output(ctx);
+} // emit_HLSL_DEFI
+
+// TEXCRD has no hand-written HLSL emitter here. The macro is defined outside
+//  this excerpt; presumably it expands to a stub emitter that reports the
+//  opcode as unimplemented -- confirm against the macro definition.
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXCRD)
+
+// Emits a "discard" guarded by a test that any of the destination
+//  register's .xyz components is below 0.0.
+void emit_HLSL_TEXKILL(Context *ctx)
+{
+    char reg[64];
+    get_HLSL_destarg_varname(ctx, reg, sizeof (reg));
+    output_line(ctx, "if (any(%s.xyz < 0.0)) discard;", reg);
+} // emit_HLSL_TEXKILL
+
+// Emits an HLSL texture sample for TEXLD. There are three paths, selected
+//  by shader version:
+//   - below 1.4: the destination register supplies both the coordinates and
+//     (through its register number) the sampler;
+//   - 1.4 up to (not including) 2.0: unimplemented, reported through fail();
+//   - 2.0 and later: coordinates come from source argument 0, the sampler
+//     from source argument 1, and CONTROL_TEXLDB selects SampleBias with
+//     the .w form of source argument 0 as the bias.
+void emit_HLSL_TEXLD(Context *ctx)
+{
+    if (!shader_version_atleast(ctx, 1, 4))
+    {
+        DestArgInfo *info = &ctx->dest_arg;
+        char dst[64];
+        char sampler[64];
+        char code[128] = {0};
+
+        RegisterList *sreg;
+        sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER, info->regnum);
+        // an unregistered sampler falls back to texture type 0.
+        const TextureType ttype = (TextureType) (sreg ? sreg->index : 0);
+
+        // !!! FIXME: this code counts on the register not having swizzles, etc.
+        get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+        get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
+                                sampler, sizeof (sampler));
+
+        if (ttype == TEXTURE_TYPE_2D)
+        {
+            make_HLSL_destarg_assign(ctx, code, sizeof (code),
+                                     "%s_texture.Sample(%s, %s.xy)",
+                                     sampler, sampler, dst);
+        } // if
+        else if (ttype == TEXTURE_TYPE_CUBE || ttype == TEXTURE_TYPE_VOLUME)
+        {
+            make_HLSL_destarg_assign(ctx, code, sizeof (code),
+                                     "%s_texture.Sample(%s, %s.xyz)",
+                                     sampler, sampler, dst);
+        } // else if
+        else
+        {
+            // NOTE(review): no return after fail(); if fail() returns, the
+            //  zero-filled buffer is still passed to output_line() below.
+            fail(ctx, "unexpected texture type");
+        } // else
+        output_line(ctx, "%s", code);
+    } // if
+
+    else if (!shader_version_atleast(ctx, 2, 0))
+    {
+        // ps_1_4 is different, too!
+        fail(ctx, "TEXLD == Shader Model 1.4 unimplemented.");  // !!! FIXME
+        return;
+    } // else if
+
+    else
+    {
+        const SourceArgInfo *samp_arg = &ctx->source_args[1];
+        RegisterList *sreg = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
+                                          samp_arg->regnum);
+        const char *funcname = NULL;
+        char src0[64] = { '\0' };
+        char src1[64]; get_HLSL_srcarg_varname(ctx, 1, src1, sizeof (src1)); // !!! FIXME: SRC_MOD?
+
+        if (sreg == NULL)
+        {
+            fail(ctx, "TEXLD using undeclared sampler");
+            return;
+        } // if
+
+        // !!! FIXME: does the d3d bias value map directly to HLSL?
+        // biassep and bias stay empty unless this is a TEXLDB, so they
+        //  vanish from the formatted call in the non-bias case.
+        const char *biassep = "";
+        char bias[64] = { '\0' };
+        if (ctx->instruction_controls == CONTROL_TEXLDB)
+        {
+            biassep = ", ";
+            make_HLSL_srcarg_string_w(ctx, 0, bias, sizeof (bias));
+            funcname = "SampleBias";
+        } // if
+        else
+        {
+            funcname = "Sample";
+        } // else
+
+        // pick the coordinate width from the sampler's texture type.
+        // NOTE(review): CONTROL_TEXLDP gets no special handling for 2D or
+        //  volume textures on this path -- confirm that is intended.
+        switch ((const TextureType) sreg->index)
+        {
+            case TEXTURE_TYPE_2D:
+                    make_HLSL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
+                break;
+            case TEXTURE_TYPE_CUBE:
+                // NOTE(review): no return after this fail(); if fail()
+                //  returns, emission continues.
+                if (ctx->instruction_controls == CONTROL_TEXLDP)
+                    fail(ctx, "TEXLDP on a cubemap");  // !!! FIXME: is this legal?
+                make_HLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
+                break;
+            case TEXTURE_TYPE_VOLUME:
+                    make_HLSL_srcarg_string_vec3(ctx, 0, src0, sizeof (src0));
+                break;
+            default:
+                fail(ctx, "unknown texture type");
+                return;
+        } // switch
+
+        assert(!isscalar(ctx, ctx->shader_type, samp_arg->regtype, samp_arg->regnum));
+        // the sampler argument's swizzle is applied to the sample result.
+        char swiz_str[6] = { '\0' };
+        make_HLSL_swizzle_string(swiz_str, sizeof (swiz_str),
+                                 samp_arg->swizzle, ctx->dest_arg.writemask);
+
+        char code[128];
+        make_HLSL_destarg_assign(ctx, code, sizeof (code),
+                                 "%s_texture.%s(%s, %s%s%s)%s", src1, funcname,
+                                 src1, src0, biassep, bias, swiz_str);
+
+        output_line(ctx, "%s", code);
+    } // else
+} // emit_HLSL_TEXLD
+
+// Emits the HLSL for TEXBEM: samples the texture bound to the destination
+//  register's sampler at the destination's .xy coordinates, perturbed by
+//  source argument 0's .xy multiplied through the sampler's "_texbem"
+//  values (.x/.z for u, .y/.w for v).
+void emit_HLSL_TEXBEM(Context *ctx)
+{
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    char coord[64];
+    char bump[64];
+    char samp[64];
+    char stmt[512];
+
+    get_HLSL_destarg_varname(ctx, coord, sizeof (coord));
+    get_HLSL_srcarg_varname(ctx, 0, bump, sizeof (bump));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, ctx->dest_arg.regnum,
+                            samp, sizeof (samp));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        "%s_texture.Sample(%s, float2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
+        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))",
+        samp, samp,
+        coord, samp, bump, samp, bump,
+        coord, samp, bump, samp, bump);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXBEM
+
+// Emits the HLSL for TEXBEML: the same perturbed sample as TEXBEM,
+//  multiplied by a luminance factor of the form
+//      (src.z * <sampler>_texbeml.x) + <sampler>_texbeml.y
+//  The additive term previously read "<sampler>_texbem.y", which is one of
+//  the bump-matrix values already used in the coordinate math above rather
+//  than a luminance value; it now reads from "_texbeml" like the scale.
+// NOTE(review): assumes "_texbeml" carries (scale, offset) in .x/.y --
+//  confirm against the code that declares and uploads that uniform.
+void emit_HLSL_TEXBEML(Context *ctx)
+{
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    DestArgInfo *info = &ctx->dest_arg;
+    char dst[64]; get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+    char src[64]; get_HLSL_srcarg_varname(ctx, 0, src, sizeof (src));
+    char sampler[64];
+    char code[512];
+
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
+                            sampler, sizeof (sampler));
+
+    make_HLSL_destarg_assign(ctx, code, sizeof (code),
+        "(%s_texture.Sample(%s, float2(%s.x + (%s_texbem.x * %s.x) + (%s_texbem.z * %s.y),"
+        " %s.y + (%s_texbem.y * %s.x) + (%s_texbem.w * %s.y)))) *"
+        " ((%s.z * %s_texbeml.x) + %s_texbeml.y)",
+        sampler, sampler,
+        dst, sampler, src, sampler, src,
+        dst, sampler, src, sampler, src,
+        src, sampler, sampler);
+
+    output_line(ctx, "%s", code);
+} // emit_HLSL_TEXBEML
+
+// TEXREG2AR and TEXREG2GB have no hand-written HLSL emitters. The macro is
+//  defined outside this excerpt; presumably it expands to a stub emitter
+//  that reports the opcode as unimplemented -- confirm.
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2AR) // !!! FIXME
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2GB) // !!! FIXME
+
+// TEXM3X2PAD emits nothing by itself.
+void emit_HLSL_TEXM3X2PAD(Context *ctx)
+{
+    // no-op ... work happens in emit_HLSL_TEXM3X2TEX().
+    (void) ctx;
+} // emit_HLSL_TEXM3X2PAD
+
+// Emits the HLSL for TEXM3X2TEX: samples the destination register's
+//  sampler at a float2 built from two dot products. The first pairs the
+//  registers recorded in ctx->texm3x2pad_src0 / ctx->texm3x2pad_dst0; the
+//  second pairs source argument 0 with the destination register.
+//  Emits nothing when ctx->texm3x2pad_src0 is -1.
+void emit_HLSL_TEXM3X2TEX(Context *ctx)
+{
+    char dst[64];
+    char pad_src[64];
+    char pad_dst[64];
+    char row_src[64];
+    char samp[64];
+    char stmt[512];
+
+    if (ctx->texm3x2pad_src0 == -1)
+        return;
+
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, ctx->dest_arg.regnum,
+                            samp, sizeof (samp));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_src0,
+                            pad_src, sizeof (pad_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x2pad_dst0,
+                            pad_dst, sizeof (pad_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
+                            row_src, sizeof (row_src));
+    get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        "%s_texture.Sample(%s, float2(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz)))",
+        samp, samp, pad_src, pad_dst, row_src, dst);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXM3X2TEX
+
+// TEXM3X3PAD emits nothing by itself.
+void emit_HLSL_TEXM3X3PAD(Context *ctx)
+{
+    // no-op ... work happens in emit_HLSL_TEXM3X3*().
+    (void) ctx;
+} // emit_HLSL_TEXM3X3PAD
+
+// Emits the HLSL for TEXM3X3TEX: samples the destination register's
+//  sampler at a float3 built from three dot products. The first two pair
+//  the registers recorded in ctx->texm3x3pad_dst0/src0 and
+//  ctx->texm3x3pad_dst1/src1; the third pairs the destination register
+//  with source argument 0. Emits nothing when ctx->texm3x3pad_src1 is -1.
+void emit_HLSL_TEXM3X3TEX(Context *ctx)
+{
+    char dst[64];
+    char pad0_dst[64];
+    char pad0_src[64];
+    char pad1_dst[64];
+    char pad1_src[64];
+    char row_src[64];
+    char samp[64];
+    char stmt[512];
+
+    if (ctx->texm3x3pad_src1 == -1)
+        return;
+
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, ctx->dest_arg.regnum,
+                            samp, sizeof (samp));
+
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
+                            pad0_dst, sizeof (pad0_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
+                            pad0_src, sizeof (pad0_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
+                            pad1_dst, sizeof (pad1_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
+                            pad1_src, sizeof (pad1_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
+                            row_src, sizeof (row_src));
+    get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        "%s_texture.Sample(%s,"
+            " float3(dot(%s.xyz, %s.xyz),"
+            " dot(%s.xyz, %s.xyz),"
+            " dot(%s.xyz, %s.xyz)))",
+        samp, samp,
+        pad0_dst, pad0_src,
+        pad1_dst, pad1_src,
+        dst, row_src);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXM3X3TEX
+
+// Writes the TEXM3X3SPEC_reflection() helper function into ctx->helpers
+//  the first time it is called. ctx->glsl_generated_texm3x3spec_helper
+//  records that it has been written, so later calls do nothing.
+// NOTE(review): the emitted expression uses component-wise "*" between the
+//  two float3 values, not dot(); confirm against the D3D reflection
+//  formula whether dot products were intended.
+void emit_HLSL_TEXM3X3SPEC_helper(Context *ctx)
+{
+    if (!ctx->glsl_generated_texm3x3spec_helper)
+    {
+        ctx->glsl_generated_texm3x3spec_helper = 1;
+
+        push_output(ctx, &ctx->helpers);
+        output_line(ctx, "float3 TEXM3X3SPEC_reflection(const float3 normal, const float3 eyeray)");
+        output_line(ctx, "{");
+        ctx->indent += 1;
+        output_line(ctx, "return (2.0 * ((normal * eyeray) / (normal * normal)) * normal) - eyeray;");
+        ctx->indent -= 1;
+        output_line(ctx, "}");
+        output_blank_line(ctx);
+        pop_output(ctx);
+    } // if
+} // emit_HLSL_TEXM3X3SPEC_helper
+
+// Emits the HLSL for TEXM3X3SPEC: samples the destination register's
+//  sampler at TEXM3X3SPEC_reflection(normal, eyeray). The normal is a
+//  float3 of three dot products (ctx->texm3x3pad_dst0/src0,
+//  ctx->texm3x3pad_dst1/src1, and destination/source argument 0); the
+//  eye ray is source argument 1's .xyz. The helper function is emitted
+//  on demand through emit_HLSL_TEXM3X3SPEC_helper().
+//  Emits nothing when ctx->texm3x3pad_src1 is -1.
+void emit_HLSL_TEXM3X3SPEC(Context *ctx)
+{
+    if (ctx->texm3x3pad_src1 == -1)
+        return;
+
+    DestArgInfo *info = &ctx->dest_arg;
+    char dst[64];
+    char src0[64];
+    char src1[64];
+    char src2[64];
+    char src3[64];
+    char src4[64];
+    char src5[64];
+    char sampler[64];
+    char code[512];
+
+    emit_HLSL_TEXM3X3SPEC_helper(ctx);
+
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, info->regnum,
+                            sampler, sizeof (sampler));
+
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
+                            src0, sizeof (src0));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
+                            src1, sizeof (src1));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
+                            src2, sizeof (src2));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
+                            src3, sizeof (src3));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
+                            src4, sizeof (src4));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[1].regnum,
+                            src5, sizeof (src5));
+    get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+
+    // No comma after the eye-ray argument: a trailing comma before the
+    //  closing paren would make the generated call a syntax error.
+    make_HLSL_destarg_assign(ctx, code, sizeof (code),
+        "%s_texture.Sample(%s, "
+            "TEXM3X3SPEC_reflection("
+                "float3("
+                    "dot(%s.xyz, %s.xyz), "
+                    "dot(%s.xyz, %s.xyz), "
+                    "dot(%s.xyz, %s.xyz)"
+                "),"
+                "%s.xyz"
+            ")"
+        ")",
+        sampler, sampler, src0, src1, src2, src3, dst, src4, src5);
+
+    output_line(ctx, "%s", code);
+} // emit_HLSL_TEXM3X3SPEC
+
+// Emits the HLSL for TEXM3X3VSPEC: like TEXM3X3SPEC, but the eye ray
+//  passed to TEXM3X3SPEC_reflection() is assembled from the .w components
+//  of the ctx->texm3x3pad_dst0 register, the ctx->texm3x3pad_dst1
+//  register, and the destination register.
+//  Emits nothing when ctx->texm3x3pad_src1 is -1.
+void emit_HLSL_TEXM3X3VSPEC(Context *ctx)
+{
+    char dst[64];
+    char pad0_dst[64];
+    char pad0_src[64];
+    char pad1_dst[64];
+    char pad1_src[64];
+    char row_src[64];
+    char samp[64];
+    char stmt[512];
+
+    if (ctx->texm3x3pad_src1 == -1)
+        return;
+
+    emit_HLSL_TEXM3X3SPEC_helper(ctx);
+
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_SAMPLER, ctx->dest_arg.regnum,
+                            samp, sizeof (samp));
+
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
+                            pad0_dst, sizeof (pad0_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
+                            pad0_src, sizeof (pad0_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
+                            pad1_dst, sizeof (pad1_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
+                            pad1_src, sizeof (pad1_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
+                            row_src, sizeof (row_src));
+    get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        "%s_texture.Sample(%s, "
+            "TEXM3X3SPEC_reflection("
+                "float3("
+                    "dot(%s.xyz, %s.xyz), "
+                    "dot(%s.xyz, %s.xyz), "
+                    "dot(%s.xyz, %s.xyz)"
+                "), "
+                "float3(%s.w, %s.w, %s.w)"
+            ")"
+        ")",
+        samp, samp,
+        pad0_dst, pad0_src,
+        pad1_dst, pad1_src,
+        dst, row_src,
+        pad0_dst, pad1_dst, dst);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXM3X3VSPEC
+
+// EXPP is emitted exactly as EXP; this simply forwards to emit_HLSL_EXP().
+void emit_HLSL_EXPP(Context *ctx)
+{
+    // !!! FIXME: msdn's asm docs don't list this opcode, I'll have to check the driver documentation.
+    // I guess this is just partial precision EXP?
+    emit_HLSL_EXP(ctx);
+} // emit_HLSL_EXPP
+
+// LOGP is emitted exactly as LOG; this simply forwards to emit_HLSL_LOG().
+void emit_HLSL_LOGP(Context *ctx)
+{
+    // LOGP is the low-precision LOG; the full-precision form is used instead.
+    emit_HLSL_LOG(ctx);
+} // emit_HLSL_LOGP
+
+// common code between CMP and CND.
+//
+// Emits one "dst = ((src0 <cmp>) ? src1 : src2)" statement per distinct
+//  source-0 swizzle component among the written destination components.
+//  Destination components whose source-0 swizzle selects the same
+//  component are grouped into a single statement, so each statement's
+//  condition is formatted from one component only (mask 1 << i).
+//  (cmp) is the comparison text appended after src0, e.g. ">= 0.0".
+//  The destination writemask is temporarily narrowed for each statement
+//  and restored to its original value before returning.
+void emit_HLSL_comparison_operations(Context *ctx, const char *cmp)
+{
+    int i, j;
+    DestArgInfo *dst = &ctx->dest_arg;
+    const SourceArgInfo *srcarg0 = &ctx->source_args[0];
+    const int origmask = dst->writemask;
+    // used_swiz[n] is set once destination component n has been emitted.
+    int used_swiz[4] = { 0, 0, 0, 0 };
+    const int writemask[4] = { dst->writemask0, dst->writemask1,
+                               dst->writemask2, dst->writemask3 };
+    const int src0swiz[4] = { srcarg0->swizzle_x, srcarg0->swizzle_y,
+                              srcarg0->swizzle_z, srcarg0->swizzle_w };
+
+    for (i = 0; i < 4; i++)
+    {
+        int mask = (1 << i);
+
+        if (!writemask[i]) continue;
+        if (used_swiz[i]) continue;
+
+        // This is a swizzle we haven't checked yet.
+        used_swiz[i] = 1;
+
+        // see if there are any other elements swizzled to match (.yyyy)
+        for (j = i + 1; j < 4; j++)
+        {
+            if (!writemask[j]) continue;
+            if (src0swiz[i] != src0swiz[j]) continue;
+            mask |= (1 << j);
+            used_swiz[j] = 1;
+        } // for
+
+        // okay, (mask) should be the writemask of swizzles we like.
+
+        char src0[64];
+        char src1[64];
+        char src2[64];
+        // the condition uses only component i; the two results use the
+        //  whole group mask.
+        make_HLSL_srcarg_string(ctx, 0, (1 << i), src0, sizeof (src0));
+        make_HLSL_srcarg_string(ctx, 1, mask, src1, sizeof (src1));
+        make_HLSL_srcarg_string(ctx, 2, mask, src2, sizeof (src2));
+
+        // narrow the destination to this group before formatting the
+        //  assignment.
+        set_dstarg_writemask(dst, mask);
+
+        char code[128];
+        make_HLSL_destarg_assign(ctx, code, sizeof (code),
+                                 "((%s %s) ? %s : %s)",
+                                 src0, cmp, src1, src2);
+        output_line(ctx, "%s", code);
+    } // for
+
+    // put the caller-visible writemask back.
+    set_dstarg_writemask(dst, origmask);
+} // emit_HLSL_comparison_operations
+
+// CND: per-component select, with "> 0.5" as the condition on source 0.
+void emit_HLSL_CND(Context *ctx)
+{
+    const char *condition = "> 0.5";
+    emit_HLSL_comparison_operations(ctx, condition);
+} // emit_HLSL_CND
+
+// Emits a "const float4" declaration for the destination register into
+//  ctx->mainline_top, one indent level in. The four components come from
+//  ctx->dwords reinterpreted as floats and formatted with floatstr().
+void emit_HLSL_DEF(Context *ctx)
+{
+    const float *fvals = (const float *) ctx->dwords; // !!! FIXME: could be int?
+    char name[64];
+    char x[32];
+    char y[32];
+    char z[32];
+    char w[32];
+
+    get_HLSL_destarg_varname(ctx, name, sizeof (name));
+    floatstr(ctx, x, sizeof (x), fvals[0], 1);
+    floatstr(ctx, y, sizeof (y), fvals[1], 1);
+    floatstr(ctx, z, sizeof (z), fvals[2], 1);
+    floatstr(ctx, w, sizeof (w), fvals[3], 1);
+
+    push_output(ctx, &ctx->mainline_top);
+    ctx->indent += 1;
+    output_line(ctx, "const float4 %s = float4(%s, %s, %s, %s);",
+                name, x, y, z, w);
+    ctx->indent -= 1;
+    pop_output(ctx);
+} // emit_HLSL_DEF
+
+// These four opcodes have no hand-written HLSL emitters. The macro is
+//  defined outside this excerpt; presumably it expands to a stub emitter
+//  that reports the opcode as unimplemented -- confirm.
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXREG2RGB) // !!! FIXME
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3TEX) // !!! FIXME
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X2DEPTH) // !!! FIXME
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDP3) // !!! FIXME
+
+// Emits the HLSL for TEXM3X3: assigns a float4 whose first three
+//  components are dot products (ctx->texm3x3pad_dst0/src0,
+//  ctx->texm3x3pad_dst1/src1, and destination/source argument 0) and whose
+//  fourth component is 1.0. No texture is sampled.
+//  Emits nothing when ctx->texm3x3pad_src1 is -1.
+void emit_HLSL_TEXM3X3(Context *ctx)
+{
+    char dst[64];
+    char pad0_dst[64];
+    char pad0_src[64];
+    char pad1_dst[64];
+    char pad1_src[64];
+    char row_src[64];
+    char stmt[512];
+
+    if (ctx->texm3x3pad_src1 == -1)
+        return;
+
+    // !!! FIXME: this code counts on the register not having swizzles, etc.
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst0,
+                            pad0_dst, sizeof (pad0_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src0,
+                            pad0_src, sizeof (pad0_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_dst1,
+                            pad1_dst, sizeof (pad1_dst));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->texm3x3pad_src1,
+                            pad1_src, sizeof (pad1_src));
+    get_HLSL_varname_in_buf(ctx, REG_TYPE_TEXTURE, ctx->source_args[0].regnum,
+                            row_src, sizeof (row_src));
+    get_HLSL_destarg_varname(ctx, dst, sizeof (dst));
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        "float4(dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), dot(%s.xyz, %s.xyz), 1.0)",
+        pad0_dst, pad0_src,
+        pad1_dst, pad1_src,
+        dst, row_src);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXM3X3
+
+// TEXDEPTH has no hand-written HLSL emitter. The macro is defined outside
+//  this excerpt; presumably it expands to a stub emitter that reports the
+//  opcode as unimplemented -- confirm.
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(TEXDEPTH) // !!! FIXME
+
+// CMP: per-component select, with ">= 0.0" as the condition on source 0.
+void emit_HLSL_CMP(Context *ctx)
+{
+    const char *condition = ">= 0.0";
+    emit_HLSL_comparison_operations(ctx, condition);
+} // emit_HLSL_CMP
+
+// BEM has no hand-written HLSL emitter. The macro is defined outside this
+//  excerpt; presumably it expands to a stub emitter that reports the
+//  opcode as unimplemented -- confirm.
+EMIT_HLSL_OPCODE_UNIMPLEMENTED_FUNC(BEM) // !!! FIXME
+
+// Emits the HLSL for DP2ADD: a two-component dot product of source
+//  arguments 0 and 1, with " + <src2>" (the scalar form of source
+//  argument 2) handed to emit_HLSL_dotprod() as the trailing text.
+void emit_HLSL_DP2ADD(Context *ctx)
+{
+    char src0[64]; make_HLSL_srcarg_string_vec2(ctx, 0, src0, sizeof (src0));
+    char src1[64]; make_HLSL_srcarg_string_vec2(ctx, 1, src1, sizeof (src1));
+    char src2[64]; make_HLSL_srcarg_string_scalar(ctx, 2, src2, sizeof (src2));
+
+    // " + " adds three characters in front of src2, so the buffer is sized
+    //  from src2 plus three; a same-sized buffer could silently truncate a
+    //  long operand string.
+    char extra[sizeof (src2) + 3];
+    snprintf(extra, sizeof (extra), " + %s", src2);
+    emit_HLSL_dotprod(ctx, src0, src1, extra);
+} // emit_HLSL_DP2ADD
+
+// Emits "ddx(src0)" assigned to the destination.
+void emit_HLSL_DSX(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "ddx(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_DSX
+
+// Emits "ddy(src0)" assigned to the destination.
+void emit_HLSL_DSY(Context *ctx)
+{
+    char operand[64];
+    char stmt[128];
+
+    make_HLSL_srcarg_string_masked(ctx, 0, operand, sizeof (operand));
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt), "ddy(%s)", operand);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_DSY
+
+// Emits an HLSL SampleGrad() call for TEXLDD. Source argument 1 names the
+//  sampler; source arguments 0, 2 and 3 are passed as the coordinate and
+//  the two gradient operands, formatted as vec2 for 2D textures and vec3
+//  for cube and volume textures. The sampler argument's swizzle is applied
+//  to the sample result. Fails if the sampler was not declared or has an
+//  unknown texture type.
+void emit_HLSL_TEXLDD(Context *ctx)
+{
+    const SourceArgInfo *sampler_arg = &ctx->source_args[1];
+    RegisterList *decl = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
+                                      sampler_arg->regnum);
+    char coord[64] = { '\0' };
+    char samp[64];
+    char grad_a[64] = { '\0' };
+    char grad_b[64] = { '\0' };
+
+    get_HLSL_srcarg_varname(ctx, 1, samp, sizeof (samp)); // !!! FIXME: SRC_MOD?
+
+    if (decl == NULL)
+    {
+        fail(ctx, "TEXLDD using undeclared sampler");
+        return;
+    } // if
+
+    const TextureType ttype = (TextureType) decl->index;
+    if (ttype == TEXTURE_TYPE_2D)
+    {
+        make_HLSL_srcarg_string_vec2(ctx, 0, coord, sizeof (coord));
+        make_HLSL_srcarg_string_vec2(ctx, 2, grad_a, sizeof (grad_a));
+        make_HLSL_srcarg_string_vec2(ctx, 3, grad_b, sizeof (grad_b));
+    } // if
+    else if ((ttype == TEXTURE_TYPE_CUBE) || (ttype == TEXTURE_TYPE_VOLUME))
+    {
+        make_HLSL_srcarg_string_vec3(ctx, 0, coord, sizeof (coord));
+        make_HLSL_srcarg_string_vec3(ctx, 2, grad_a, sizeof (grad_a));
+        make_HLSL_srcarg_string_vec3(ctx, 3, grad_b, sizeof (grad_b));
+    } // else if
+    else
+    {
+        fail(ctx, "unknown texture type");
+        return;
+    } // else
+
+    assert(!isscalar(ctx, ctx->shader_type, sampler_arg->regtype, sampler_arg->regnum));
+    char swizzle[6] = { '\0' };
+    make_HLSL_swizzle_string(swizzle, sizeof (swizzle),
+                             sampler_arg->swizzle, ctx->dest_arg.writemask);
+
+    char stmt[128];
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "%s_texture.SampleGrad(%s, %s, %s, %s)%s",
+                             samp, samp, coord, grad_a, grad_b, swizzle);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXLDD
+
+// Emits the HLSL for SETP: assigns "(src0 <op> src1)" to the destination.
+//  The operator comes from the scalar comparison helper when the writemask
+//  covers one component and from the vector helper otherwise.
+void emit_HLSL_SETP(Context *ctx)
+{
+    const int width = vecsize_from_writemask(ctx->dest_arg.writemask);
+    char lhs[64];
+    char rhs[64];
+    char stmt[128];
+    const char *op;
+
+    make_HLSL_srcarg_string_masked(ctx, 0, lhs, sizeof (lhs));
+    make_HLSL_srcarg_string_masked(ctx, 1, rhs, sizeof (rhs));
+
+    // destination is always predicate register (which is type bvec4).
+    if (width == 1)
+        op = get_HLSL_comparison_string_scalar(ctx);
+    else
+        op = get_HLSL_comparison_string_vector(ctx);
+
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+                             "(%s %s %s)", lhs, op, rhs);
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_SETP
+
+// Emits an HLSL SampleLevel() call for TEXLDL. Source argument 1 names the
+//  sampler; source argument 0 supplies the coordinates (.xy for 2D, .xyz
+//  for cube and volume textures) and, through its .w, the level argument.
+//  The sampler argument's swizzle is applied to the sample result. Fails
+//  if the sampler was not declared or has an unknown texture type.
+void emit_HLSL_TEXLDL(Context *ctx)
+{
+    const SourceArgInfo *sampler_arg = &ctx->source_args[1];
+    RegisterList *decl = reglist_find(&ctx->samplers, REG_TYPE_SAMPLER,
+                                      sampler_arg->regnum);
+    const char *pattern = NULL;
+    char coord[64];
+    char samp[64];
+
+    make_HLSL_srcarg_string_full(ctx, 0, coord, sizeof (coord));
+    get_HLSL_srcarg_varname(ctx, 1, samp, sizeof (samp)); // !!! FIXME: SRC_MOD?
+
+    if (decl == NULL)
+    {
+        fail(ctx, "TEXLDL using undeclared sampler");
+        return;
+    } // if
+
+    const TextureType ttype = (TextureType) decl->index;
+    if (ttype == TEXTURE_TYPE_2D)
+    {
+        pattern = "%s_texture.SampleLevel(%s, %s.xy, %s.w)%s";
+    } // if
+    else if ((ttype == TEXTURE_TYPE_CUBE) || (ttype == TEXTURE_TYPE_VOLUME))
+    {
+        pattern = "%s_texture.SampleLevel(%s, %s.xyz, %s.w)%s";
+    } // else if
+    else
+    {
+        fail(ctx, "unknown texture type");
+        return;
+    } // else
+
+    assert(!isscalar(ctx, ctx->shader_type, sampler_arg->regtype, sampler_arg->regnum));
+    char swizzle[6] = { '\0' };
+    make_HLSL_swizzle_string(swizzle, sizeof (swizzle),
+                             sampler_arg->swizzle, ctx->dest_arg.writemask);
+
+    char stmt[128];
+    make_HLSL_destarg_assign(ctx, stmt, sizeof (stmt),
+        pattern, samp, samp, coord, coord, swizzle);
+
+    output_line(ctx, "%s", stmt);
+} // emit_HLSL_TEXLDL
+
+// Emits a break guarded by the scalar form of source argument 0.
+void emit_HLSL_BREAKP(Context *ctx)
+{
+    char cond[64];
+    make_HLSL_srcarg_string_scalar(ctx, 0, cond, sizeof (cond));
+    output_line(ctx, "if (%s) { break; }", cond);
+} // emit_HLSL_BREAKP
+
+// RESERVED emits nothing.
+void emit_HLSL_RESERVED(Context *ctx)
+{
+    // do nothing; fails in the state machine.
+    (void) ctx;
+} // emit_HLSL_RESERVED
+
+#endif  // SUPPORT_PROFILE_HLSL
+
+#pragma GCC visibility pop