--- a/src/video/SDL_surface.c Thu Aug 16 02:14:13 2007 +0000
+++ b/src/video/SDL_surface.c Thu Aug 16 05:56:24 2007 +0000
@@ -509,20 +509,220 @@
return 0;
}
-static int
-SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
-{
- /* FIXME: We have to worry about packing order.. *sigh* */
- SDL_SetError("1-bpp rect fill not yet implemented");
- return -1;
+#ifdef __SSE__
+/* *INDENT-OFF* */
+/* SSE_BEGIN: build c128, a 128-bit value holding four copies of 'color', from the temporary cccc[4] (DECLARE_ALIGNED is defined elsewhere; presumably it gives 16-byte alignment -- confirm). */
+#define SSE_BEGIN \
+ DECLARE_ALIGNED(Uint32, cccc[4], 16); \
+ cccc[0] = color; \
+ cccc[1] = color; \
+ cccc[2] = color; \
+ cccc[3] = color; \
+ __m128 c128 = *(__m128 *)cccc;
+/* SSE_WORK: for each full 64-byte chunk of the n bytes at p, write c128 with four non-temporal 16-byte stores and advance p; uses i, n, p and c128 from the expansion site and leaves n unchanged. */
+#define SSE_WORK \
+ for (i = n / 64; i--;) { \
+ _mm_stream_ps((float *)(p+0), c128); \
+ _mm_stream_ps((float *)(p+16), c128); \
+ _mm_stream_ps((float *)(p+32), c128); \
+ _mm_stream_ps((float *)(p+48), c128); \
+ p += 64; \
+ }
+/* SSE_END: expands to nothing; the SSE fill performs no cleanup step. */
+#define SSE_END
+/* DEFINE_SSE_FILLRECT(bpp, type): define SDL_FillRect<bpp>SSE(), which fills h rows of w pixels ('bpp' bytes each, stored as 'type') starting at 'pixels', with rows 'pitch' bytes apart. */
+#define DEFINE_SSE_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+ SSE_BEGIN; \
+ \
+ while (h--) { \
+ int i, n = w * bpp; /* n: bytes still to fill in this row */ \
+ Uint8 *p = pixels; \
+ \
+ if (n > 15) { /* shorter rows are handled by the scalar loop below only */ \
+ int adjust = 16 - ((uintptr_t)p & 15); /* bytes up to the next 16-byte boundary (16 when p is already on one) */ \
+ if (adjust < 16) { \
+ n -= adjust; \
+ adjust /= bpp; /* byte count -> pixel count */ \
+ while(adjust--) { \
+ *((type *)p) = (type)color; \
+ p += bpp; \
+ } \
+ } \
+ SSE_WORK; /* writes the full 64-byte chunks and advances p; n is not reduced */ \
+ } \
+ if (n & 63) { /* bytes left over after the 64-byte chunks */ \
+ int remainder = (n & 63); \
+ remainder /= bpp; /* byte count -> pixel count */ \
+ while(remainder--) { \
+ *((type *)p) = (type)color; \
+ p += bpp; \
+ } \
+ } \
+ pixels += pitch; \
+ } \
+ \
+ SSE_END; \
}
-static int
-SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+DEFINE_SSE_FILLRECT(1, Uint8) /* defines SDL_FillRect1SSE() for 1-byte pixels */
+DEFINE_SSE_FILLRECT(2, Uint16) /* defines SDL_FillRect2SSE() for 2-byte pixels */
+DEFINE_SSE_FILLRECT(4, Uint32) /* defines SDL_FillRect4SSE() for 4-byte pixels */
+
+/* *INDENT-ON* */
+#endif /* __SSE__ */
+
+#ifdef __MMX__
+/* *INDENT-OFF* */
+/* MMX_BEGIN: declare c64, a 64-bit MMX value holding 'color' in both 32-bit halves. */
+#define MMX_BEGIN \
+ __m64 c64 = _mm_set_pi32(color, color)
+/* MMX_WORK: fill each full 64-byte chunk at p with plain 8-byte stores of c64; _mm_stream_pi (MOVNTQ) needs SSE, which this MMX-only path cannot assume. */
+#define MMX_WORK \
+    for (i = n / 64; i--;) { /* one pass per full 64-byte chunk; n is not reduced */ \
+        *(__m64 *)(p+0) = c64; \
+        *(__m64 *)(p+8) = c64; \
+        *(__m64 *)(p+16) = c64; \
+        *(__m64 *)(p+24) = c64; \
+        *(__m64 *)(p+32) = c64; \
+        *(__m64 *)(p+40) = c64; \
+        *(__m64 *)(p+48) = c64; \
+        *(__m64 *)(p+56) = c64; \
+        p += 64; \
+    }
+/* MMX_END: _mm_empty() issues EMMS, releasing the MMX register state so later x87 floating-point code works. */
+#define MMX_END \
+ _mm_empty()
+/* DEFINE_MMX_FILLRECT(bpp, type): define SDL_FillRect<bpp>MMX(), which fills h rows of w pixels ('bpp' bytes each, stored as 'type') starting at 'pixels', with rows 'pitch' bytes apart. */
+#define DEFINE_MMX_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+ MMX_BEGIN; \
+ \
+ while (h--) { \
+ int i, n = w * bpp; /* n: bytes still to fill in this row */ \
+ Uint8 *p = pixels; \
+ \
+ if (n > 7) { /* shorter rows are handled by the scalar loop below only */ \
+ int adjust = 8 - ((uintptr_t)p & 7); /* bytes up to the next 8-byte boundary (8 when p is already on one) */ \
+ if (adjust < 8) { \
+ n -= adjust; \
+ adjust /= bpp; /* byte count -> pixel count */ \
+ while(adjust--) { \
+ *((type *)p) = (type)color; \
+ p += bpp; \
+ } \
+ } \
+ MMX_WORK; /* writes the full 64-byte chunks and advances p; n is not reduced */ \
+ } \
+ if (n & 63) { /* bytes left over after the 64-byte chunks */ \
+ int remainder = (n & 63); \
+ remainder /= bpp; /* byte count -> pixel count */ \
+ while(remainder--) { \
+ *((type *)p) = (type)color; \
+ p += bpp; \
+ } \
+ } \
+ pixels += pitch; \
+ } \
+ \
+ MMX_END; \
+}
+/* Instantiate the MMX fill for 1-, 2- and 4-byte pixels: SDL_FillRect1MMX(), SDL_FillRect2MMX(), SDL_FillRect4MMX(). */
+DEFINE_MMX_FILLRECT(1, Uint8)
+DEFINE_MMX_FILLRECT(2, Uint16)
+DEFINE_MMX_FILLRECT(4, Uint32)
+
+/* *INDENT-ON* */
+#endif /* __MMX__ */
+
+/* Fill w x h 1-byte pixels. 'pixels' is the first byte of the top row and
+ * 'pitch' the byte step between rows. Whole 32-bit words of 'color' are
+ * stored via SDL_memset4(), so 'color' should carry the fill byte in every
+ * byte (the visible caller replicates it before calling). */
+static void
+SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
{
- /* FIXME: We have to worry about packing order.. *sigh* */
- SDL_SetError("4-bpp rect fill not yet implemented");
- return -1;
+    const Uint8 fill = (Uint8) color;
+
+    for (; h--; pixels += pitch) {
+        Uint8 *p = pixels;
+        int n = w;
+        int tail;
+
+        if (n > 3) {
+            /* Single bytes until p reaches a 4-byte boundary (0-3 stores). */
+            while ((uintptr_t) p & 3) {
+                *p++ = fill;
+                --n;
+            }
+            /* One 32-bit store per four remaining pixels. */
+            SDL_memset4(p, color, (n >> 2));
+        }
+
+        /* Step over the words just filled, then store the 1-3 pixels
+           that did not make up a whole word. */
+        tail = n & 3;
+        if (tail) {
+            p += (n & ~3);
+            while (tail--) {
+                *p++ = fill;
+            }
+        }
+    }
+}
+
+/* Fill w x h 2-byte pixels, rows 'pitch' bytes apart; 'color' should hold the pixel in both 16-bit halves (32-bit stores are used). */
+static void
+SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    const Uint16 fill = (Uint16) color;
+    for (; h--; pixels += pitch) {
+        Uint16 *p = (Uint16 *) pixels;
+        int n = w;
+        if (n > 1) {
+            if ((uintptr_t) p & 2) {    /* p is not on a 4-byte boundary */
+                *p++ = fill;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 1));
+        }
+        if (n & 1) {
+            p[n - 1] = fill;    /* odd pixel left after the 32-bit stores */
+        }
+    }
+}
+
+/* Fill w x h 3-byte pixels with the low 24 bits of 'color', written in host
+ * memory order: big-endian hosts store bits 16-23 first, little-endian last. */
+static void
+SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    const int big = (SDL_BYTEORDER == SDL_BIG_ENDIAN);
+    const Uint8 b0 = (Uint8) ((color >> (big ? 16 : 0)) & 0xFF);
+    const Uint8 b1 = (Uint8) ((color >> 8) & 0xFF);
+    const Uint8 b2 = (Uint8) ((color >> (big ? 0 : 16)) & 0xFF);
+    for (; h--; pixels += pitch) {
+        Uint8 *p = pixels;
+        int n = w;
+        while (n--) {
+            *p++ = b0;
+            *p++ = b1;
+            *p++ = b2;
+        }
+    }
+}
+
+/* Fill w x h 4-byte pixels: one SDL_memset4() of w words per row, rows 'pitch' bytes apart. */
+static void
+SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    for (; h--; pixels += pitch) {
+        SDL_memset4(pixels, color, w);
+    }
}
/*
@@ -531,23 +731,12 @@
int
SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
{
- int x, y;
- Uint8 *row;
+ Uint8 *pixels;
/* This function doesn't work on surfaces < 8 bpp */
if (dst->format->BitsPerPixel < 8) {
- switch (dst->format->BitsPerPixel) {
- case 1:
- return SDL_FillRect1(dst, dstrect, color);
- break;
- case 4:
- return SDL_FillRect4(dst, dstrect, color);
- break;
- default:
- SDL_SetError("Fill rect on unsupported surface format");
- return (-1);
- break;
- }
+ SDL_SetError("Fill rect on unsupported surface format");
+ return (-1);
}
/* If 'dstrect' == NULL, then fill the whole surface */
@@ -564,97 +753,83 @@
if (SDL_LockSurface(dst) != 0) {
return (-1);
}
- row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
+
+ pixels =
+ (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
dstrect->x * dst->format->BytesPerPixel;
- if (dst->format->palette || (color == 0)) {
- x = dstrect->w * dst->format->BytesPerPixel;
-#ifndef __MACOSX__ /* memset() is optimized on Mac OS X */
- if (!color && !((uintptr_t) row & 3) && !(x & 3)
- && !(dst->pitch & 3)) {
- int n = x >> 2;
- for (y = dstrect->h; y; --y) {
- SDL_memset4(row, 0, n);
- row += dst->pitch;
- }
- } else
-#endif /* !__MACOSX__ */
+
+ switch (dst->format->BytesPerPixel) {
+ case 1:
{
- for (y = dstrect->h; y; y--) {
- SDL_memset(row, color, x);
- row += dst->pitch;
+ color |= (color << 8);
+ color |= (color << 16);
+#ifdef __SSE__
+ if (SDL_HasSSE()) {
+ SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
+ break;
}
- }
- } else {
- switch (dst->format->BytesPerPixel) {
- case 2:
- {
- Uint16 c = (Uint16) color;
- Uint32 cc = (Uint32) c << 16 | c;
- for (y = dstrect->h; y; --y) {
- Uint16 *pixels = (Uint16 *) row;
- int n = dstrect->w;
- if ((uintptr_t) pixels & 3) {
- *pixels++ = c;
- n--;
- }
- if (n >> 1)
- SDL_memset4(pixels, cc, n >> 1);
- if (n & 1)
- pixels[n - 1] = c;
- row += dst->pitch;
- }
+#endif
+#ifdef __MMX__
+ if (SDL_HasMMX()) {
+ SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
+ break;
}
+#endif
+ SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
break;
-
- case 3:
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
- color <<= 8;
-#endif
- for (y = dstrect->h; y; --y) {
- Uint8 *pixels = row;
- for (x = dstrect->w; x; --x) {
- SDL_memcpy(pixels, &color, 3);
- pixels += 3;
- }
- row += dst->pitch;
- }
- break;
+ }
- case 4:
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
- if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
- Uint32 cccc[4] __attribute__ ((aligned(16))) = {
- color, color, color, color};
- int i, n = dstrect->w / 4;
- __asm__ __volatile__(" movdqa (%0), %%xmm0\n"::
- "r"(cccc):"memory");
- for (y = dstrect->h; y; --y) {
- Uint8 *pixels = row;
- for (i = n / 2; i--;) {
- /* *INDENT-OFF* */
- __asm__ __volatile__(" prefetchnta 256(%0)\n"
- " movdqa %%xmm0, (%0)\n"
- " movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
- /* *INDENT-ON* */
- pixels += 32;
- }
- if (n & 1) {
- __asm__ __volatile__(" movdqa %%xmm0, (%0)\n"::
- "r"(pixels):"memory");
- }
- row += dst->pitch;
- }
- __asm__ __volatile__(" emms\n"::);
+ case 2:
+ {
+ color |= (color << 16);
+#ifdef __SSE__
+ if (SDL_HasSSE()) {
+ SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
break;
}
#endif
- for (y = dstrect->h; y; --y) {
- SDL_memset4(row, color, dstrect->w);
- row += dst->pitch;
+#ifdef __MMX__
+ if (SDL_HasMMX()) {
+ SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
+ break;
}
+#endif
+ SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+ break;
+ }
+
+ case 3:
+ /* 24-bit RGB is a slow path, at least for now. */
+ {
+ SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+ break;
+ }
+
+ case 4:
+ {
+#ifdef __SSE__
+ if (SDL_HasSSE()) {
+ SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
+ break;
+ }
+#endif
+#ifdef __MMX__
+ if (SDL_HasMMX()) {
+ SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
+ dstrect->h);
+ break;
+ }
+#endif
+ SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
break;
}
}
+
SDL_UnlockSurface(dst);
/* We're done! */