Added release/acquire memory barriers to the atomic API
authorSam Lantinga <slouken@libsdl.org>
Wed, 10 Jul 2013 18:31:17 -0700
changeset 7393 358696c354a8
parent 7392 7e32fcb41b44
child 7394 38dc4961ab15
Added release/acquire memory barriers to the atomic API * Added a destructor to clean up TLS memory at thread shutdown * Refactored the TLS code to have platform independent code and a small platform dependent core with a fallback to generic code if platform dependent functions fail. * Fixed recursion issues with SDL_GetErrBuf()
include/SDL_atomic.h
include/SDL_thread.h
src/thread/SDL_systhread.h
src/thread/SDL_thread.c
src/thread/SDL_thread_c.h
src/thread/beos/SDL_systls.c
src/thread/generic/SDL_systls.c
src/thread/pthread/SDL_systls.c
src/thread/windows/SDL_systls.c
test/testthread.c
--- a/include/SDL_atomic.h	Wed Jul 10 02:37:57 2013 -0700
+++ b/include/SDL_atomic.h	Wed Jul 10 18:31:17 2013 -0700
@@ -45,6 +45,7 @@
  *
  * There's also lots of good information here:
  * http://www.1024cores.net/home/lock-free-algorithms
+ * http://preshing.com/
  *
  * These operations may or may not actually be implemented using
  * processor specific atomic operations. When possible they are
@@ -135,6 +136,32 @@
 { SDL_SpinLock _tmp = 0; SDL_AtomicLock(&_tmp); SDL_AtomicUnlock(&_tmp); }
 #endif
 
+/**
+ * Memory barriers are designed to prevent reads and writes from being
+ * reordered by the compiler and being seen out of order on multi-core CPUs.
+ *
+ * A typical pattern would be for thread A to write some data and a flag,
+ * and for thread B to read the flag and get the data. In this case you
+ * would insert a release barrier between writing the data and the flag,
+ * guaranteeing that the data write completes no later than the flag is
+ * written, and you would insert an acquire barrier between reading the
+ * flag and reading the data, to ensure that the data is not read until
+ * after the flag has been read.
+ *
+ * In this pattern you should always see a release barrier paired with
+ * an acquire barrier and you should gate the data reads/writes with a
+ * single flag variable.
+ *
+ * For more information on these semantics, take a look at the blog post:
+ * http://preshing.com/20120913/acquire-and-release-semantics
+ */
+/* FIXME: This is correct for x86 and x64 but not other CPUs
+   For PPC we need the lwsync instruction, and on ARM some variant of dmb
+ */
+#define SDL_MemoryBarrierRelease()  SDL_CompilerBarrier()
+#define SDL_MemoryBarrierAcquire()  SDL_CompilerBarrier()
+
+
 /* Platform specific optimized versions of the atomic functions,
  * you can disable these by defining SDL_DISABLE_ATOMIC_INLINE
  */
--- a/include/SDL_thread.h	Wed Jul 10 02:37:57 2013 -0700
+++ b/include/SDL_thread.h	Wed Jul 10 18:31:17 2013 -0700
@@ -48,8 +48,8 @@
 /* The SDL thread ID */
 typedef unsigned long SDL_threadID;
 
-/* Thread local storage ID */
-typedef int SDL_TLSID;
+/* Thread local storage ID, 0 is the invalid ID */
+typedef unsigned SDL_TLSID;
 
 /* The SDL thread priority
  *
@@ -219,13 +219,14 @@
  *
  *  \param id The thread local storage ID
  *  \param value The value to associate with the ID for the current thread
+ *  \param destructor A function called when the thread exits, to free the value.
  *
  *  \return 0 on success, -1 on error
  *
  *  \sa SDL_TLSCreate()
  *  \sa SDL_TLSGet()
  */
-extern DECLSPEC int SDLCALL SDL_TLSSet(SDL_TLSID id, const void *value);
+extern DECLSPEC int SDLCALL SDL_TLSSet(SDL_TLSID id, const void *value, void (*destructor)(void*));
 
 
 /* Ends C function definitions when using C++ */
--- a/src/thread/SDL_systhread.h	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/SDL_systhread.h	Wed Jul 10 18:31:17 2013 -0700
@@ -50,6 +50,12 @@
  */
 extern void SDL_SYS_WaitThread(SDL_Thread * thread);
 
+/* Get the thread local storage table for this thread (callers treat NULL as "no table yet") */
+extern SDL_TLSData *SDL_SYS_GetTLSData(void);
+
+/* Set the thread local storage table for this thread; callers treat a non-zero return as failure */
+extern int SDL_SYS_SetTLSData(SDL_TLSData *data);
+
 #endif /* _SDL_systhread_h */
 
 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/thread/SDL_thread.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/SDL_thread.c	Wed Jul 10 18:31:17 2013 -0700
@@ -28,38 +28,219 @@
 #include "../SDL_error_c.h"
 
 
+SDL_TLSID
+SDL_TLSCreate()
+{
+    static SDL_atomic_t next_tls_id;  /* process-wide counter shared by all callers */
+    return 1 + SDL_AtomicIncRef(&next_tls_id);  /* 0 is reserved as the invalid ID */
+}
+
+void *
+SDL_TLSGet(SDL_TLSID id)
+{
+    /* IDs are 1-based: no table yet, the invalid ID 0, or an ID past the table all read as NULL */
+    SDL_TLSData *tls = SDL_SYS_GetTLSData();
+
+    if (tls && id != 0 && id <= tls->limit) {
+        return tls->array[id-1].data;
+    }
+    return NULL;
+}
+
+int
+SDL_TLSSet(SDL_TLSID id, const void *value, void (*destructor)(void *))
+{
+    SDL_TLSData *storage;
+    /* Store value and its destructor in slot id of the calling thread's table (IDs are 1-based) */
+    if (id == 0) {
+        return SDL_InvalidParamError("id");
+    }
+    /* Create the table, or grow it, when slot id lies beyond its current limit */
+    storage = SDL_SYS_GetTLSData();
+    if (!storage || id > storage->limit) {
+        int i, oldlimit, newlimit;
+        oldlimit = storage ? storage->limit : 0;
+        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
+        storage = (SDL_TLSData *)SDL_realloc(storage, sizeof(*storage)+(newlimit-1)*sizeof(storage->array[0]));
+        if (!storage) {
+            return SDL_OutOfMemory();
+        }
+        storage->limit = newlimit;
+        for (i = oldlimit; i < newlimit; ++i) {
+            storage->array[i].data = NULL;
+            storage->array[i].destructor = NULL;
+        }
+        if (SDL_SYS_SetTLSData(storage) != 0) {
+            if (oldlimit == 0) { SDL_free(storage); }  /* brand-new table that nothing else references */
+            return -1;
+        }
+    }
+
+    storage->array[id-1].data = SDL_const_cast(void*, value);
+    storage->array[id-1].destructor = destructor;
+    return 0;
+}
+
+static void
+SDL_TLSCleanup()
+{
+    /* Run each slot's destructor (if any) on its data, then detach and free this thread's table */
+    SDL_TLSData *tls = SDL_SYS_GetTLSData();
+    int slot;
+    if (!tls) {
+        return;
+    }
+    for (slot = 0; slot < tls->limit; ++slot) {
+        if (tls->array[slot].destructor) {
+            tls->array[slot].destructor(tls->array[slot].data);
+        }
+    }
+    SDL_SYS_SetTLSData(NULL);
+    SDL_free(tls);
+}
+
+
+/* This is a generic implementation of thread-local storage which doesn't
+   require additional OS support.
+
+   It is not especially efficient and only releases a thread's storage when
+   SDL_TLSCleanup() runs for that thread.  If there is a real OS that doesn't
+   support thread-local storage this should be improved to be production quality.
+*/
+
+typedef struct SDL_TLSEntry {
+    SDL_threadID thread;        /* owning thread, as returned by SDL_ThreadID() */
+    SDL_TLSData *storage;       /* that thread's TLS table */
+    struct SDL_TLSEntry *next;  /* next entry in the singly linked list */
+} SDL_TLSEntry;
+
+static SDL_mutex *SDL_generic_TLS_mutex;  /* guards SDL_generic_TLS; created on first use in SDL_Generic_GetTLSData() */
+static SDL_TLSEntry *SDL_generic_TLS;     /* head of the list of per-thread entries */
+
+
+SDL_TLSData *
+SDL_Generic_GetTLSData()
+{
+    SDL_threadID thread = SDL_ThreadID();
+    SDL_TLSEntry *entry;
+    SDL_TLSData *storage = NULL;
+    /* Create the list mutex on first use; tls_lock serializes the creation */
+    if (!SDL_generic_TLS_mutex) {
+        static SDL_SpinLock tls_lock;
+        SDL_AtomicLock(&tls_lock);
+        if (!SDL_generic_TLS_mutex) {  /* check again now that tls_lock is held */
+            SDL_mutex *mutex = SDL_CreateMutex();
+            SDL_MemoryBarrierRelease();  /* intent: the store below must not be reordered before mutex creation */
+            SDL_generic_TLS_mutex = mutex;
+            if (!SDL_generic_TLS_mutex) {
+                SDL_AtomicUnlock(&tls_lock);
+                return NULL;  /* creation failed; the pointer stays NULL so a later call tries again */
+            }
+        }
+        SDL_AtomicUnlock(&tls_lock);
+    }
+    /* Look up the calling thread's entry; storage stays NULL when there is none */
+    SDL_MemoryBarrierAcquire();
+    SDL_LockMutex(SDL_generic_TLS_mutex);
+    for (entry = SDL_generic_TLS; entry; entry = entry->next) {
+        if (entry->thread == thread) {
+            storage = entry->storage;
+            break;
+        }
+    }
+    SDL_UnlockMutex(SDL_generic_TLS_mutex);
+
+    return storage;
+}
+
+int
+SDL_Generic_SetTLSData(SDL_TLSData *storage)
+{
+    SDL_threadID thread = SDL_ThreadID();
+    SDL_TLSEntry *prev, *entry;
+    SDL_bool failed = SDL_FALSE;
+    /* Attach storage to the calling thread's list entry, or drop that entry when storage is NULL */
+    /* SDL_Generic_GetTLSData() is always called first, so we can assume SDL_generic_TLS_mutex */
+    SDL_LockMutex(SDL_generic_TLS_mutex);
+    prev = NULL;
+    for (entry = SDL_generic_TLS; entry; entry = entry->next) {
+        if (entry->thread == thread) {
+            break;
+        }
+        prev = entry;
+    }
+    if (entry && storage) {
+        /* Existing entry: point it at the (possibly reallocated) table */
+        entry->storage = storage;
+    } else if (entry) {
+        /* NULL storage: unlink and free the entry; entry is not read again after this */
+        if (prev) {
+            prev->next = entry->next;
+        } else {
+            SDL_generic_TLS = entry->next;
+        }
+        SDL_free(entry);
+    } else if (storage) {
+        /* First table for this thread: push a new entry at the head of the list */
+        entry = (SDL_TLSEntry *)SDL_malloc(sizeof(*entry));
+        if (entry) {
+            entry->thread = thread;
+            entry->storage = storage;
+            entry->next = SDL_generic_TLS;
+            SDL_generic_TLS = entry;
+        }
+        failed = (entry == NULL) ? SDL_TRUE : SDL_FALSE;
+    }
+    SDL_UnlockMutex(SDL_generic_TLS_mutex);
+    return failed ? SDL_OutOfMemory() : 0;  /* reported only after the mutex is released */
+}
+
 /* Routine to get the thread-specific error variable */
 SDL_error *
 SDL_GetErrBuf(void)
 {
-    static SDL_SpinLock spinlock;
+    static SDL_SpinLock tls_lock;
     static SDL_bool tls_being_created;
     static SDL_TLSID tls_errbuf;
     static SDL_error SDL_global_errbuf;
+    const SDL_error *ALLOCATION_IN_PROGRESS = (SDL_error *)-1;
     SDL_error *errbuf;
 
+    /* tls_being_created is there simply to prevent recursion if SDL_TLSCreate() fails.
+       It also means it's possible for another thread to also use SDL_global_errbuf,
+       but that's very unlikely and hopefully won't cause issues.
+     */
     if (!tls_errbuf && !tls_being_created) {
-        SDL_AtomicLock(&spinlock);
+        SDL_AtomicLock(&tls_lock);
         if (!tls_errbuf) {
-            /* SDL_TLSCreate() could fail and call SDL_SetError() */
+            SDL_TLSID slot;
             tls_being_created = SDL_TRUE;
-            tls_errbuf = SDL_TLSCreate();
+            slot = SDL_TLSCreate();
             tls_being_created = SDL_FALSE;
+            SDL_MemoryBarrierRelease();
+            tls_errbuf = slot;
         }
-        SDL_AtomicUnlock(&spinlock);
+        SDL_AtomicUnlock(&tls_lock);
     }
     if (!tls_errbuf) {
         return &SDL_global_errbuf;
     }
 
-    errbuf = SDL_TLSGet(tls_errbuf);
+    SDL_MemoryBarrierAcquire();
+    errbuf = (SDL_error *)SDL_TLSGet(tls_errbuf);
+    if (errbuf == ALLOCATION_IN_PROGRESS) {
+        return &SDL_global_errbuf;
+    }
     if (!errbuf) {
+        /* Mark that we're in the middle of allocating our buffer */
+        SDL_TLSSet(tls_errbuf, ALLOCATION_IN_PROGRESS, NULL);
         errbuf = (SDL_error *)SDL_malloc(sizeof(*errbuf));
         if (!errbuf) {
+            SDL_TLSSet(tls_errbuf, NULL, NULL);
             return &SDL_global_errbuf;
         }
         SDL_zerop(errbuf);
-        SDL_TLSSet(tls_errbuf, errbuf);
+        SDL_TLSSet(tls_errbuf, errbuf, SDL_free);
     }
     return errbuf;
 }
@@ -82,9 +263,7 @@
     void *userdata = args->data;
     int *statusloc = &args->info->status;
 
-    /* Perform any system-dependent setup
-       - this function cannot fail, and cannot use SDL_SetError()
-     */
+    /* Perform any system-dependent setup - this function may not fail */
     SDL_SYS_SetupThread(args->info->name);
 
     /* Get the thread id */
@@ -95,6 +274,9 @@
 
     /* Run the function */
     *statusloc = userfunc(userdata);
+
+    /* Clean up thread-local storage */
+    SDL_TLSCleanup();
 }
 
 #ifdef SDL_PASSED_BEGINTHREAD_ENDTHREAD
--- a/src/thread/SDL_thread_c.h	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/SDL_thread_c.h	Wed Jul 10 18:31:17 2013 -0700
@@ -56,6 +56,30 @@
 /* This is the function called to run a thread */
 extern void SDL_RunThread(void *data);
 
+/* This is the system-independent thread local storage structure */
+typedef struct {
+    int limit;                      /* number of entries allocated in array[] */
+    struct {
+        void *data;                 /* value stored by SDL_TLSSet() */
+        void (*destructor)(void*);  /* called with data by SDL_TLSCleanup(); may be NULL */
+    } array[1];                     /* over-allocated to hold limit entries; TLS ID n lives in array[n-1] */
+} SDL_TLSData;
+
+/* This is how many TLS entries we allocate at once */
+#define TLS_ALLOC_CHUNKSIZE 4
+
+/* Get cross-platform, slow, thread local storage for this thread.
+   This is only intended as a fallback if getting real thread-local
+   storage fails or isn't supported on this platform.
+ */
+extern SDL_TLSData *SDL_Generic_GetTLSData(void);
+
+/* Set cross-platform, slow, thread local storage for this thread.
+   This is only intended as a fallback if getting real thread-local
+   storage fails or isn't supported on this platform.
+ */
+extern int SDL_Generic_SetTLSData(SDL_TLSData *data);
+
 #endif /* _SDL_thread_c_h */
 
 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/thread/beos/SDL_systls.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/beos/SDL_systls.c	Wed Jul 10 18:31:17 2013 -0700
@@ -21,81 +21,47 @@
 
 #include "SDL_config.h"
 #include "SDL_thread.h"
+#include "../SDL_thread_c.h"
 
 #if SDL_THREAD_BEOS
 
 #include <support/TLS.h>
 
 
-#define TLS_ALLOC_CHUNKSIZE 8
-
-typedef struct {
-    int limit;
-    void *data[1];
-} SDL_TLSData;
-
-static SDL_SpinLock tls_lock;
 static int32 thread_local_storage = B_NO_MEMORY;
-static SDL_atomic_t tls_id;
-
+static SDL_bool generic_local_storage = SDL_FALSE;
 
-SDL_TLSID
-SDL_TLSCreate()
+SDL_TLSData *
+SDL_SYS_GetTLSData()
 {
-    if (thread_local_storage == B_NO_MEMORY) {
-        SDL_AtomicLock(&tls_lock);
-        if (thread_local_storage == B_NO_MEMORY) {
-            thread_local_storage = tls_allocate();
-            if (thread_local_storage == B_NO_MEMORY) {
-                SDL_SetError("tls_allocate() failed");
-                SDL_AtomicUnlock(&tls_lock);
-                return 0;
+    if (thread_local_storage == B_NO_MEMORY && !generic_local_storage) {
+        static SDL_SpinLock lock;
+        SDL_AtomicLock(&lock);
+        if (thread_local_storage == B_NO_MEMORY && !generic_local_storage) {
+            int32 storage = tls_allocate();
+            if (storage != B_NO_MEMORY) {
+                SDL_MemoryBarrierRelease();
+                thread_local_storage = storage;
+            } else {
+                generic_local_storage = SDL_TRUE;
             }
         }
-        SDL_AtomicUnlock(&tls_lock);
+        SDL_AtomicUnlock(&lock);
     }
-    return SDL_AtomicIncRef(&tls_id)+1;
-}
-
-void *
-SDL_TLSGet(SDL_TLSID id)
-{
-    SDL_TLSData *data;
-
-    data = (SDL_TLSData *)tls_get(thread_local_storage);
-    if (!data || id <= 0 || id > data->limit) {
-        return NULL;
+    if (generic_local_storage) {
+        return SDL_Generic_GetTLSData();
     }
-    return data->data[id-1];
+    SDL_MemoryBarrierAcquire();
+    return (SDL_TLSData *)tls_get(thread_local_storage);
 }
 
 int
-SDL_TLSSet(SDL_TLSID id, const void *value)
+SDL_SYS_SetTLSData(SDL_TLSData *data)
 {
-    SDL_TLSData *data;
-
-    if (thread_local_storage == B_NO_MEMORY || id <= 0) {
-        return SDL_InvalidParamError(id);
+    if (generic_local_storage) {
+        return SDL_Generic_SetTLSData(data);
     }
-
-    data = (SDL_TLSData *)tls_get(thread_local_storage);
-    if (!data || id > data->limit) {
-        int i, oldlimit, newlimit;
-
-        oldlimit = data ? data->limit : 0;
-        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
-        data = (SDL_TLSData *)SDL_realloc(data, sizeof(*data)+(newlimit-1)*sizeof(void*));
-        if (!data) {
-            return SDL_OutOfMemory();
-        }
-        data->limit = newlimit;
-        for (i = oldlimit; i < newlimit; ++i) {
-            data->data[i] = NULL;
-        }
-        tls_set(thread_local_storage, data);
-    }
-
-    data->data[id-1] = SDL_const_cast(void*, value);
+    tls_set(thread_local_storage, data);
     return 0;
 }
 
--- a/src/thread/generic/SDL_systls.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/generic/SDL_systls.c	Wed Jul 10 18:31:17 2013 -0700
@@ -20,144 +20,19 @@
 */
 
 #include "SDL_config.h"
-#include "SDL_thread.h"
-
-/* This is a generic implementation of thread-local storage which doesn't
-   require additional OS support.
-
-   It is not especially efficient and doesn't clean up thread-local storage
-   as threads exit.  If there is a real OS that doesn't support thread-local
-   storage this implementation should be improved to be production quality.
-*/
-
-#define TLS_ALLOC_CHUNKSIZE 8
-
-typedef struct {
-    int limit;
-    void *data[1];
-} SDL_TLSData;
-
-typedef struct SDL_TLSEntry {
-    SDL_threadID thread;
-    SDL_TLSData *data;
-    struct SDL_TLSEntry *next;
-} SDL_TLSEntry;
-
-static SDL_SpinLock tls_lock;
-static SDL_mutex *tls_mutex;
-static SDL_TLSEntry *thread_local_storage;
-static SDL_atomic_t tls_id;
+#include "../SDL_thread_c.h"
 
 
-static SDL_TLSData *GetTLSData()
-{
-    SDL_threadID thread = SDL_ThreadID();
-    SDL_TLSEntry *entry;
-    SDL_TLSData *data = NULL;
-
-    if (!tls_mutex) {
-        SDL_AtomicLock(&tls_lock);
-        if (!tls_mutex) {
-            tls_mutex = SDL_CreateMutex();
-            if (!tls_mutex) {
-                SDL_AtomicUnlock(&tls_lock);
-                return NULL;
-            }
-        }
-        SDL_AtomicUnlock(&tls_lock);
-    }
-
-    SDL_LockMutex(tls_mutex);
-    for (entry = thread_local_storage; entry; entry = entry->next) {
-        if (entry->thread == thread) {
-            data = entry->data;
-            break;
-        }
-    }
-    SDL_UnlockMutex(tls_mutex);
-
-    return data;
-}
-
-static int SetTLSData(SDL_TLSData *data)
+SDL_TLSData *
+SDL_SYS_GetTLSData()
 {
-    SDL_threadID thread = SDL_ThreadID();
-    SDL_TLSEntry *entry;
-
-    /* GetTLSData() is always called first, so we can assume tls_mutex */
-    SDL_LockMutex(tls_mutex);
-    for (entry = thread_local_storage; entry; entry = entry->next) {
-        if (entry->thread == thread) {
-            entry->data = data;
-            break;
-        }
-    }
-    if (!entry) {
-        entry = (SDL_TLSEntry *)SDL_malloc(sizeof(*entry));
-        if (entry) {
-            entry->thread = thread;
-            entry->data = data;
-            entry->next = thread_local_storage;
-            thread_local_storage = entry;
-        }
-    }
-    SDL_UnlockMutex(tls_mutex);
-
-    if (!entry) {
-        return SDL_OutOfMemory();
-    }
-    return 0;
-}
-
-
-SDL_TLSID
-SDL_TLSCreate()
-{
-    return SDL_AtomicIncRef(&tls_id)+1;
-}
-
-void *
-SDL_TLSGet(SDL_TLSID id)
-{
-    SDL_TLSData *data;
-
-    data = GetTLSData();
-    if (!data || id <= 0 || id > data->limit) {
-        return NULL;
-    }
-    return data->data[id-1];
+    return SDL_Generic_GetTLSData();
 }
 
 int
-SDL_TLSSet(SDL_TLSID id, const void *value)
+SDL_SYS_SetTLSData(SDL_TLSData *data)
 {
-    SDL_TLSData *data;
-
-    if (id <= 0) {
-        return SDL_InvalidParamError(id);
-    }
-
-    data = GetTLSData();
-    if (!data || id > data->limit) {
-        int i, oldlimit, newlimit;
-
-        oldlimit = data ? data->limit : 0;
-        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
-        data = (SDL_TLSData *)SDL_realloc(data, sizeof(*data)+(newlimit-1)*sizeof(void*));
-        if (!data) {
-            return SDL_OutOfMemory();
-        }
-        data->limit = newlimit;
-        for (i = oldlimit; i < newlimit; ++i) {
-            data->data[i] = NULL;
-        }
-        if (SetTLSData(data) != 0) {
-            return -1;
-        }
-    }
-
-    data->data[id-1] = SDL_const_cast(void*, value);
-    return 0;
+    return SDL_Generic_SetTLSData(data);
 }
 
 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/thread/pthread/SDL_systls.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/pthread/SDL_systls.c	Wed Jul 10 18:31:17 2013 -0700
@@ -18,83 +18,51 @@
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.
 */
-
 #include "SDL_config.h"
 #include "SDL_thread.h"
+#include "../SDL_thread_c.h"
 
 #include <pthread.h>
 
 
-#define TLS_ALLOC_CHUNKSIZE 8
+#define INVALID_PTHREAD_KEY ((pthread_key_t)-1)
 
-typedef struct {
-    int limit;
-    void *data[1];
-} SDL_TLSData;
-
-static SDL_SpinLock tls_lock;
-static pthread_key_t thread_local_storage;
-static SDL_atomic_t tls_id;
+static pthread_key_t thread_local_storage = INVALID_PTHREAD_KEY;
+static SDL_bool generic_local_storage = SDL_FALSE;
 
-
-SDL_TLSID
-SDL_TLSCreate()
+SDL_TLSData *
+SDL_SYS_GetTLSData()
 {
-    if (!thread_local_storage) {
-        SDL_AtomicLock(&tls_lock);
-        if (!thread_local_storage) {
-            if (pthread_key_create(&thread_local_storage, NULL) != 0) {
-                SDL_SetError("pthread_key_create() failed");
-                SDL_AtomicUnlock(&tls_lock);
-                return 0;
+    if (thread_local_storage == INVALID_PTHREAD_KEY && !generic_local_storage) {
+        static SDL_SpinLock lock;
+        SDL_AtomicLock(&lock);
+        if (thread_local_storage == INVALID_PTHREAD_KEY && !generic_local_storage) {
+            pthread_key_t storage;
+            if (pthread_key_create(&storage, NULL) == 0) {
+                SDL_MemoryBarrierRelease();
+                thread_local_storage = storage;
+            } else {
+                generic_local_storage = SDL_TRUE;
             }
         }
-        SDL_AtomicUnlock(&tls_lock);
+        SDL_AtomicUnlock(&lock);
     }
-    return SDL_AtomicIncRef(&tls_id)+1;
-}
-
-void *
-SDL_TLSGet(SDL_TLSID id)
-{
-    SDL_TLSData *data;
-
-    data = (SDL_TLSData *)pthread_getspecific(thread_local_storage);
-    if (!data || id <= 0 || id > data->limit) {
-        return NULL;
+    if (generic_local_storage) {
+        return SDL_Generic_GetTLSData();
     }
-    return data->data[id-1];
+    SDL_MemoryBarrierAcquire();
+    return (SDL_TLSData *)pthread_getspecific(thread_local_storage);
 }
 
 int
-SDL_TLSSet(SDL_TLSID id, const void *value)
+SDL_SYS_SetTLSData(SDL_TLSData *data)
 {
-    SDL_TLSData *data;
-
-    if (!thread_local_storage || id <= 0) {
-        return SDL_InvalidParamError(id);
+    if (generic_local_storage) {
+        return SDL_Generic_SetTLSData(data);
     }
-
-    data = (SDL_TLSData *)pthread_getspecific(thread_local_storage);
-    if (!data || id > data->limit) {
-        int i, oldlimit, newlimit;
-
-        oldlimit = data ? data->limit : 0;
-        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
-        data = (SDL_TLSData *)SDL_realloc(data, sizeof(*data)+(newlimit-1)*sizeof(void*));
-        if (!data) {
-            return SDL_OutOfMemory();
-        }
-        data->limit = newlimit;
-        for (i = oldlimit; i < newlimit; ++i) {
-            data->data[i] = NULL;
-        }
-        if (pthread_setspecific(thread_local_storage, data) != 0) {
-            return SDL_SetError("pthread_setspecific() failed");
-        }
+    if (pthread_setspecific(thread_local_storage, data) != 0) {
+        return SDL_SetError("pthread_setspecific() failed");
     }
-
-    data->data[id-1] = SDL_const_cast(void*, value);
     return 0;
 }
 
--- a/src/thread/windows/SDL_systls.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/src/thread/windows/SDL_systls.c	Wed Jul 10 18:31:17 2013 -0700
@@ -21,83 +21,49 @@
 
 #include "SDL_config.h"
 #include "SDL_thread.h"
+#include "../SDL_thread_c.h"
 
 #if SDL_THREAD_WINDOWS
 
 #include "../../core/windows/SDL_windows.h"
 
 
-#define TLS_ALLOC_CHUNKSIZE 8
-
-typedef struct {
-    int limit;
-    void *data[1];
-} SDL_TLSData;
-
-static SDL_SpinLock tls_lock;
 static DWORD thread_local_storage = TLS_OUT_OF_INDEXES;
-static SDL_atomic_t tls_id;
-
+static SDL_bool generic_local_storage = SDL_FALSE;
 
-SDL_TLSID
-SDL_TLSCreate()
+SDL_TLSData *
+SDL_SYS_GetTLSData()
 {
-    if (thread_local_storage == TLS_OUT_OF_INDEXES) {
-        SDL_AtomicLock(&tls_lock);
-        if (thread_local_storage == TLS_OUT_OF_INDEXES) {
-            thread_local_storage = TlsAlloc();
-            if (thread_local_storage == TLS_OUT_OF_INDEXES) {
-                SDL_SetError("TlsAlloc() failed");
-                SDL_AtomicUnlock(&tls_lock);
-                return 0;
+    if (thread_local_storage == TLS_OUT_OF_INDEXES && !generic_local_storage) {
+        static SDL_SpinLock lock;
+        SDL_AtomicLock(&lock);
+        if (thread_local_storage == TLS_OUT_OF_INDEXES && !generic_local_storage) {
+            DWORD storage = TlsAlloc();
+            if (storage != TLS_OUT_OF_INDEXES) {
+                SDL_MemoryBarrierRelease();
+                thread_local_storage = storage;
+            } else {
+                generic_local_storage = SDL_TRUE;
             }
         }
-        SDL_AtomicUnlock(&tls_lock);
+        SDL_AtomicUnlock(&lock);
     }
-    return SDL_AtomicIncRef(&tls_id)+1;
-}
-
-void *
-SDL_TLSGet(SDL_TLSID id)
-{
-    SDL_TLSData *data;
-
-    data = (SDL_TLSData *)TlsGetValue(thread_local_storage);
-    if (!data || id <= 0 || id > data->limit) {
-        return NULL;
+    if (generic_local_storage) {
+        return SDL_Generic_GetTLSData();
     }
-    return data->data[id-1];
+    SDL_MemoryBarrierAcquire();
+    return (SDL_TLSData *)TlsGetValue(thread_local_storage);
 }
 
 int
-SDL_TLSSet(SDL_TLSID id, const void *value)
+SDL_SYS_SetTLSData(SDL_TLSData *data)
 {
-    SDL_TLSData *data;
-
-    if (thread_local_storage == TLS_OUT_OF_INDEXES || id <= 0) {
-        return SDL_InvalidParamError(id);
+    if (generic_local_storage) {
+        return SDL_Generic_SetTLSData(data);
     }
-
-    data = (SDL_TLSData *)TlsGetValue(thread_local_storage);
-    if (!data || id > data->limit) {
-        int i, oldlimit, newlimit;
-
-        oldlimit = data ? data->limit : 0;
-        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
-        data = (SDL_TLSData *)SDL_realloc(data, sizeof(*data)+(newlimit-1)*sizeof(void*));
-        if (!data) {
-            return SDL_OutOfMemory();
-        }
-        data->limit = newlimit;
-        for (i = oldlimit; i < newlimit; ++i) {
-            data->data[i] = NULL;
-        }
-        if (!TlsSetValue(thread_local_storage, data)) {
-            return SDL_SetError("TlsSetValue() failed");
-        }
+    if (!TlsSetValue(thread_local_storage, data)) {
+        return SDL_SetError("TlsSetValue() failed");
     }
-
-    data->data[id-1] = SDL_const_cast(void*, value);
     return 0;
 }
 
--- a/test/testthread.c	Wed Jul 10 02:37:57 2013 -0700
+++ b/test/testthread.c	Wed Jul 10 18:31:17 2013 -0700
@@ -33,7 +33,7 @@
 int SDLCALL
 ThreadFunc(void *data)
 {
-    SDL_TLSSet(tls, "baby thread");
+    SDL_TLSSet(tls, "baby thread", NULL);
     printf("Started thread %s: My thread id is %lu, thread data = %s\n",
            (char *) data, SDL_ThreadID(), (const char *)SDL_TLSGet(tls));
     while (alive) {
@@ -66,7 +66,7 @@
 
     tls = SDL_TLSCreate();
     SDL_assert(tls);
-    SDL_TLSSet(tls, "main thread");
+    SDL_TLSSet(tls, "main thread", NULL);
     printf("Main thread data initially: %s\n", (const char *)SDL_TLSGet(tls));
 
     alive = 1;