Skip to content

Commit

Permalink
Optimized the mixing code.
Browse files Browse the repository at this point in the history
Loop unrolling and restrict keywords, plus an SSE version. Even the scalar
version is more than twice as fast as the previous code!
  • Loading branch information
icculus committed Apr 23, 2018
1 parent c1e20ab commit af0fb82
Showing 1 changed file with 228 additions and 31 deletions.
259 changes: 228 additions & 31 deletions mojoal.c
Expand Up @@ -13,6 +13,10 @@
#include <math.h>
#include <float.h>

#ifdef __SSE__
#include <mmintrin.h>
#endif

#include "AL/al.h"
#include "AL/alc.h"
#include "SDL.h"
Expand Down Expand Up @@ -241,6 +245,8 @@ static ALCsizei ring_buffer_get(RingBuffer *ring, void *_data, ALCsizei size)
return size; /* may have been clamped if there wasn't enough data... */
}

typedef void (*BufferMixerFn)(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes);

typedef struct ALbuffer
{
SDL_atomic_t allocated;
Expand All @@ -250,6 +256,7 @@ typedef struct ALbuffer
ALsizei frequency;
ALsizei len; /* length of data in bytes. */
const float *data; /* we only work in Float32 format. */
BufferMixerFn mixer;
SDL_atomic_t refcount; /* if zero, can be deleted or alBufferData'd */
} ALbuffer;

Expand Down Expand Up @@ -536,45 +543,215 @@ static void obtain_newly_queued_buffers(BufferQueue *queue)
queue_new_buffer_items_recursive(queue, items);
}

static void mix_source_data_float32(ALCcontext *ctx, ALsource *src, const int channels, const ALfloat *panning, const float *data, float *stream, const ALsizei mixframes)

static void mix_float32_c1_scalar(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes)
{
const ALfloat left = panning[0];
const ALfloat right = panning[1];
const int unrolled = mixframes / 4;
const int leftover = mixframes % 4;
ALsizei i;

if ((left == 0.0f) && (right == 0.0f)) {
return; /* it's silent, don't spend time mixing it. */
if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < unrolled; i++, data += 4, stream += 8) {
const float samp0 = data[0];
const float samp1 = data[1];
const float samp2 = data[2];
const float samp3 = data[3];
stream[0] += samp0;
stream[1] += samp0;
stream[2] += samp1;
stream[3] += samp1;
stream[4] += samp2;
stream[5] += samp2;
stream[6] += samp3;
stream[7] += samp3;
}
for (i = 0; i < leftover; i++, stream += 2) {
const float samp = *(data++);
stream[0] += samp;
stream[1] += samp;
}
} else {
for (i = 0; i < unrolled; i++, data += 4, stream += 8) {
const float samp0 = data[0];
const float samp1 = data[1];
const float samp2 = data[2];
const float samp3 = data[3];
stream[0] += samp0 * left;
stream[1] += samp0 * right;
stream[2] += samp1 * left;
stream[3] += samp1 * right;
stream[4] += samp2 * left;
stream[5] += samp2 * right;
stream[6] += samp3 * left;
stream[7] += samp3 * right;
}
for (i = 0; i < leftover; i++, stream += 2) {
const float samp = *(data++);
stream[0] += samp * left;
stream[1] += samp * right;
}
}
}

FIXME("make this efficient. :)");
FIXME("currently expects output to be stereo");
static void mix_float32_c2_scalar(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes)
{
const ALfloat left = panning[0];
const ALfloat right = panning[1];
const int unrolled = mixframes / 4;
const int leftover = mixframes % 4;
ALsizei i;

if (channels == 1) {
if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < mixframes; i++) {
const float samp = *(data++);
*(stream++) += samp;
*(stream++) += samp;
}
} else {
for (i = 0; i < mixframes; i++) {
const float samp = *(data++);
*(stream++) += samp * left;
*(stream++) += samp * right;
}
if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < unrolled; i++, stream += 8, data += 8) {
stream[0] += data[0];
stream[1] += data[1];
stream[2] += data[2];
stream[3] += data[3];
stream[4] += data[4];
stream[5] += data[5];
stream[6] += data[6];
stream[7] += data[7];
}
} else if (channels == 2) {
if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < mixframes; i++) {
*(stream++) += *(data++);
*(stream++) += *(data++);
}
} else {
for (i = 0; i < mixframes; i++) {
*(stream++) += *(data++) * left;
*(stream++) += *(data++) * right;
for (i = 0; i < leftover; i++, stream += 2, data += 2) {
stream[0] += data[0];
stream[1] += data[1];
}
} else {
for (i = 0; i < unrolled; i++, stream += 8, data += 8) {
stream[0] += data[0] * left;
stream[1] += data[1] * right;
stream[2] += data[2] * left;
stream[3] += data[3] * right;
stream[4] += data[4] * left;
stream[5] += data[5] * right;
stream[6] += data[6] * left;
stream[7] += data[7] * right;
}
for (i = 0; i < leftover; i++, stream += 2, data += 2) {
stream[0] += data[0] * left;
stream[1] += data[1] * right;
}
}
}


#ifdef __SSE__
static void mix_float32_c1_sse(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes)
{
const ALfloat left = panning[0];
const ALfloat right = panning[1];
const int unrolled = mixframes / 8;
const int leftover = mixframes % 8;
ALsizei i;

if ( (((size_t)stream) % 16) || (((size_t)data) % 16) ) {
/* unaligned, do scalar version. */
mix_float32_c1_scalar(panning, data, stream, mixframes);
} else if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < unrolled; i++, data += 8, stream += 16) {
/* We have 8 SSE registers, load 6 of them, have two for math (unrolled once). */
{
const __m128 vdataload1 = _mm_load_ps(data);
const __m128 vdataload2 = _mm_load_ps(data+4);
const __m128 vstream1 = _mm_load_ps(stream);
const __m128 vstream2 = _mm_load_ps(stream+4);
const __m128 vstream3 = _mm_load_ps(stream+8);
const __m128 vstream4 = _mm_load_ps(stream+12);
_mm_store_ps(stream, _mm_add_ps(vstream1, _mm_shuffle_ps(vdataload1, vdataload1, _MM_SHUFFLE(0, 0, 1, 1))));
_mm_store_ps(stream+4, _mm_add_ps(vstream2, _mm_shuffle_ps(vdataload1, vdataload1, _MM_SHUFFLE(2, 2, 3, 3))));
_mm_store_ps(stream+8, _mm_add_ps(vstream3, _mm_shuffle_ps(vdataload2, vdataload2, _MM_SHUFFLE(0, 0, 1, 1))));
_mm_store_ps(stream+12, _mm_add_ps(vstream4, _mm_shuffle_ps(vdataload2, vdataload2, _MM_SHUFFLE(2, 2, 3, 3))));
}
}
for (i = 0; i < leftover; i++, stream += 2) {
const float samp = *(data++);
stream[0] += samp;
stream[1] += samp;
}
} else {
const __m128 vleftright = { left, right, left, right };
for (i = 0; i < unrolled; i++, data += 8, stream += 16) {
/* We have 8 SSE registers, load 6 of them, have two for math (unrolled once). */
const __m128 vdataload1 = _mm_load_ps(data);
const __m128 vdataload2 = _mm_load_ps(data+4);
const __m128 vstream1 = _mm_load_ps(stream);
const __m128 vstream2 = _mm_load_ps(stream+4);
const __m128 vstream3 = _mm_load_ps(stream+8);
const __m128 vstream4 = _mm_load_ps(stream+12);
_mm_store_ps(stream, _mm_add_ps(vstream1, _mm_mul_ps(_mm_shuffle_ps(vdataload1, vdataload1, _MM_SHUFFLE(0, 0, 1, 1)), vleftright)));
_mm_store_ps(stream+4, _mm_add_ps(vstream2, _mm_mul_ps(_mm_shuffle_ps(vdataload1, vdataload1, _MM_SHUFFLE(2, 2, 3, 3)), vleftright)));
_mm_store_ps(stream+8, _mm_add_ps(vstream3, _mm_mul_ps(_mm_shuffle_ps(vdataload2, vdataload2, _MM_SHUFFLE(0, 0, 1, 1)), vleftright)));
_mm_store_ps(stream+12, _mm_add_ps(vstream4, _mm_mul_ps(_mm_shuffle_ps(vdataload2, vdataload2, _MM_SHUFFLE(2, 2, 3, 3)), vleftright)));
}
for (i = 0; i < leftover; i++, stream += 2) {
const float samp = *(data++);
stream[0] += samp * left;
stream[1] += samp * right;
}
}
}

static void mix_float32_c2_sse(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes)
{
const ALfloat left = panning[0];
const ALfloat right = panning[1];
const int unrolled = mixframes / 4;
const int leftover = mixframes % 4;
ALsizei i;

/* We can align this to 16 in one special case. */
if ( ((((size_t)stream) % 16) == 8) && ((((size_t)data) % 16) == 8) && mixframes ) {
stream[0] += data[0] * left;
stream[1] += data[1] * right;
stream += 2;
data += 2;
mix_float32_c2_sse(panning, data + 2, stream + 2, mixframes - 1);
return;
}

if ( (((size_t)stream) % 16) || (((size_t)data) % 16) ) {
/* unaligned, do scalar version. */
mix_float32_c2_scalar(panning, data, stream, mixframes);
} else if ((left == 1.0f) && (right == 1.0f)) {
for (i = 0; i < unrolled; i++, data += 8, stream += 8) {
const __m128 vdata1 = _mm_load_ps(data);
const __m128 vdata2 = _mm_load_ps(data+4);
const __m128 vstream1 = _mm_load_ps(stream);
const __m128 vstream2 = _mm_load_ps(stream+4);
_mm_store_ps(stream, _mm_add_ps(vstream1, vdata1));
_mm_store_ps(stream+4, _mm_add_ps(vstream2, vdata2));
}
for (i = 0; i < leftover; i++, stream += 2, data += 2) {
stream[0] += data[0];
stream[1] += data[1];
}
} else {
const __m128 vleftright = { left, right, left, right };
for (i = 0; i < unrolled; i++, data += 8, stream += 8) {
const __m128 vdata1 = _mm_load_ps(data);
const __m128 vdata2 = _mm_load_ps(data+4);
const __m128 vstream1 = _mm_load_ps(stream);
const __m128 vstream2 = _mm_load_ps(stream+4);
_mm_store_ps(stream, _mm_add_ps(vstream1, _mm_mul_ps(vdata1, vleftright)));
_mm_store_ps(stream+4, _mm_add_ps(vstream2, _mm_mul_ps(vdata2, vleftright)));
}
for (i = 0; i < leftover; i++, stream += 2, data += 2) {
stream[0] += data[0] * left;
stream[1] += data[1] * right;
}
}
}
#endif

static void mix_buffer(const ALbuffer *buffer, const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes)
{
const ALfloat left = panning[0];
const ALfloat right = panning[1];
FIXME("currently expects output to be stereo");
if ((left != 0.0f) || (right != 0.0f)) { /* don't bother mixing in silence. */
buffer->mixer(panning, data, stream, mixframes);
}
}

Expand Down Expand Up @@ -611,15 +788,15 @@ static ALboolean mix_source_buffer(ALCcontext *ctx, ALsource *src, BufferQueueIt
const int mixbufframes = mixbuflen / bufferframesize;
const int getframes = SDL_min(remainingmixframes, mixbufframes);
SDL_AudioStreamGet(src->stream, mixbuf, getframes * bufferframesize);
mix_source_data_float32(ctx, src, buffer->channels, panning, mixbuf, *stream, getframes);
mix_buffer(buffer, panning, mixbuf, *stream, getframes);
*len -= getframes * deviceframesize;
*stream += getframes * ctx->device->channels;
remainingmixframes -= getframes;
}
} else {
const int framesavail = (buffer->len - src->offset) / bufferframesize;
const int mixframes = SDL_min(framesneeded, framesavail);
mix_source_data_float32(ctx, src, buffer->channels, panning, data, *stream, mixframes);
mix_buffer(buffer, panning, data, *stream, mixframes);
src->offset += mixframes * bufferframesize;
*len -= mixframes * deviceframesize;
*stream += mixframes * ctx->device->channels;
Expand Down Expand Up @@ -3148,6 +3325,9 @@ void alSourceUnqueueBuffers(ALuint name, ALsizei nb, ALuint *bufnames)
} while (!SDL_AtomicCASPtr(&ctx->device->playback.buffer_queue_pool, ptr, queue));
}

static void mix_no_op(const ALfloat * restrict panning, const float * restrict data, float * restrict stream, const ALsizei mixframes) { /* does nothing. */ }


void alGenBuffers(ALsizei n, ALuint *names)
{
ALCcontext *ctx = get_current_context();
Expand Down Expand Up @@ -3223,6 +3403,7 @@ void alGenBuffers(ALsizei n, ALuint *names)
buffer->frequency = 0;
buffer->len = 0;
buffer->data = NULL;
buffer->mixer = mix_no_op;
SDL_AtomicSet(&buffer->refcount, 0);
SDL_AtomicSet(&buffer->allocated, 1); /* we officially own it. */
}
Expand Down Expand Up @@ -3289,6 +3470,7 @@ void alBufferData(ALuint name, ALenum alfmt, const ALvoid *data, ALsizei size, A
ALCsizei framesize;
int rc;
int prevrefcount;
BufferMixerFn mixer;

if (!buffer) return;

Expand All @@ -3297,6 +3479,21 @@ void alBufferData(ALuint name, ALenum alfmt, const ALvoid *data, ALsizei size, A
return;
}

if (channels == 1) {
#ifdef __SSE__
if (SDL_HasSSE()) { mixer = mix_float32_c1_sse; } else
#endif
{ mixer = mix_float32_c1_scalar; }
} else if (channels == 2) {
#ifdef __SSE__
if (SDL_HasSSE()) { mixer = mix_float32_c2_sse; } else
#endif
{ mixer = mix_float32_c2_scalar; }
} else {
SDL_assert(!"uhoh, no mixer function for this format!");
mixer = mix_no_op;
}

/* increment refcount so this can't be deleted or alBufferData'd from another thread */
prevrefcount = SDL_AtomicIncRef(&buffer->refcount);
SDL_assert(prevrefcount >= 0);
Expand Down Expand Up @@ -3350,7 +3547,7 @@ void alBufferData(ALuint name, ALenum alfmt, const ALvoid *data, ALsizei size, A
buffer->bits = (ALint) SDL_AUDIO_BITSIZE(sdlfmt); /* we're in float32, though. */
buffer->frequency = freq;
buffer->len = (ALsizei) sdlcvt.len_cvt;

buffer->mixer = mixer;
(void) SDL_AtomicDecRef(&buffer->refcount); /* ready to go! */
}

Expand Down

0 comments on commit af0fb82

Please sign in to comment.