Added SSE version of SDL_FillRect() for 32-bit ARGB surfaces
authorSam Lantinga <slouken@libsdl.org>
Mon, 13 Aug 2007 06:24:56 +0000
changeset 2239 31835fd24b2b
parent 2238 93ea84f77d83
child 2240 eebaa05f8bd9
Added SSE version of SDL_FillRect() for 32-bit ARGB surfaces
src/video/SDL_surface.c
--- a/src/video/SDL_surface.c	Mon Aug 13 03:03:23 2007 +0000
+++ b/src/video/SDL_surface.c	Mon Aug 13 06:24:56 2007 +0000
@@ -587,20 +587,22 @@
     } else {
         switch (dst->format->BytesPerPixel) {
         case 2:
-            for (y = dstrect->h; y; --y) {
-                Uint16 *pixels = (Uint16 *) row;
+            {
                 Uint16 c = (Uint16) color;
                 Uint32 cc = (Uint32) c << 16 | c;
-                int n = dstrect->w;
-                if ((uintptr_t) pixels & 3) {
-                    *pixels++ = c;
-                    n--;
+                for (y = dstrect->h; y; --y) {
+                    Uint16 *pixels = (Uint16 *) row;
+                    int n = dstrect->w;
+                    if ((uintptr_t) pixels & 3) {
+                        *pixels++ = c;
+                        n--;
+                    }
+                    if (n >> 1)
+                        SDL_memset4(pixels, cc, n >> 1);
+                    if (n & 1)
+                        pixels[n - 1] = c;
+                    row += dst->pitch;
                 }
-                if (n >> 1)
-                    SDL_memset4(pixels, cc, n >> 1);
-                if (n & 1)
-                    pixels[n - 1] = c;
-                row += dst->pitch;
             }
             break;
 
@@ -619,6 +621,33 @@
             break;
 
         case 4:
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
+            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
+                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
+                color, color, color, color};
+                int i, n = dstrect->w / 4;
+                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
+                                     "r"(cccc):"memory");
+                for (y = dstrect->h; y; --y) {
+                    Uint8 *pixels = row;
+                    for (i = n / 2; i--;) {
+                        /* *INDENT-OFF* */
+                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
+                                             "	movdqa %%xmm0, (%0)\n"
+                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
+                        /* *INDENT-ON* */
+                        pixels += 32;
+                    }
+                    if (n & 1) {
+                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
+                                             "r"(pixels):"memory");
+                    }
+                    row += dst->pitch;
+                }
+                __asm__ __volatile__("	emms\n"::);
+                break;
+            }
+#endif
             for (y = dstrect->h; y; --y) {
                 SDL_memset4(row, color, dstrect->w);
                 row += dst->pitch;