--- a/src/video/SDL_blit_A.c Thu Aug 16 21:54:26 2007 +0000
+++ b/src/video/SDL_blit_A.c Thu Aug 16 22:18:53 2007 +0000
@@ -24,41 +24,6 @@
#include "SDL_video.h"
#include "SDL_blit.h"
-/*
- In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
- Checking if _mm_free is #defined in malloc.h is is the only way to
- determine if the Processor Pack is installed, as far as I can tell.
-*/
-
-#if SDL_ASSEMBLY_ROUTINES
-# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-# define MMX_ASMBLIT 1
-# define GCC_ASMBLIT 1
-# elif defined(_MSC_VER) && defined(_M_IX86)
-# if (_MSC_VER <= 1200)
-# include <malloc.h>
-# if defined(_mm_free)
-# define HAVE_MMINTRIN_H 1
-# endif
-# else /* Visual Studio > VC6 always has mmintrin.h */
-# define HAVE_MMINTRIN_H 1
-# endif
-# if HAVE_MMINTRIN_H
-# define MMX_ASMBLIT 1
-# define MSVC_ASMBLIT 1
-# endif
-# endif
-#endif /* SDL_ASSEMBLY_ROUTINES */
-
-/* Function to check the CPU flags */
-#include "SDL_cpuinfo.h"
-#if GCC_ASMBLIT
-#include "mmx.h"
-#elif MSVC_ASMBLIT
-#include <mmintrin.h>
-#include <mm3dnow.h>
-#endif
-
/* Functions to perform alpha blended blitting */
/* N->1 blending with per-surface alpha */
@@ -232,239 +197,8 @@
}
}
-#if GCC_ASMBLIT
-/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
-static void
-BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
-{
- int width = info->d_width;
- int height = info->d_height;
- Uint32 *srcp = (Uint32 *) info->s_pixels;
- int srcskip = info->s_skip >> 2;
- Uint32 *dstp = (Uint32 *) info->d_pixels;
- int dstskip = info->d_skip >> 2;
- Uint32 dalpha = info->dst->Amask;
- Uint8 load[8];
-
- *(Uint64 *) load = 0x00fefefe00fefefeULL; /* alpha128 mask */
- movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
- *(Uint64 *) load = 0x0001010100010101ULL; /* !alpha128 mask */
- movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
- movd_m2r(dalpha, mm7); /* dst alpha mask */
- punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
- while (height--) {
- /* *INDENT-OFF* */
- DUFFS_LOOP_DOUBLE2(
- {
- Uint32 s = *srcp++;
- Uint32 d = *dstp;
- *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
- + (s & d & 0x00010101)) | dalpha;
- },{
- movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
- movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
-
- movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
- movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
-
- pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
- pand_r2r(mm4, mm5); /* src & mask -> mm5 */
- paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
- pand_r2r(mm1, mm2); /* src & dst -> mm2 */
- psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
- pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
- paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
-
- por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
- movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
- dstp += 2;
- srcp += 2;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
- emms();
-}
-
-/* fast RGB888->(A)RGB888 blending with surface alpha */
-static void
-BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
-{
- SDL_PixelFormat *df = info->dst;
- unsigned alpha = info->src->alpha;
-
- if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
- /* only call a128 version when R,G,B occupy lower bits */
- BlitRGBtoRGBSurfaceAlpha128MMX(info);
- } else {
- int width = info->d_width;
- int height = info->d_height;
- Uint32 *srcp = (Uint32 *) info->s_pixels;
- int srcskip = info->s_skip >> 2;
- Uint32 *dstp = (Uint32 *) info->d_pixels;
- int dstskip = info->d_skip >> 2;
-
- pxor_r2r(mm5, mm5); /* 0 -> mm5 */
- /* form the alpha mult */
- movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
- punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
- punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
- alpha =
- (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
- Bshift);
- movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
- punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
- pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
- /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
- movd_m2r(df->Amask, mm7); /* dst alpha mask */
- punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
-
- while (height--) {
- /* *INDENT-OFF* */
- DUFFS_LOOP_DOUBLE2({
- /* One Pixel Blend */
- movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
- movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
- punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
- punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
-
- psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
- pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
- psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
- paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
-
- packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
- por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
- movd_r2m(mm2, *dstp);/* mm2 -> pixel */
- ++srcp;
- ++dstp;
- },{
- /* Two Pixels Blend */
- movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
- movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
- movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
- movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
+#ifdef __MMX__
- punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
- punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
- punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
- punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
-
- psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
- pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
- psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
- paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
-
- psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
- pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
- psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
- paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
-
- packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
- por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
-
- movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
-
- srcp += 2;
- dstp += 2;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
- emms();
- }
-}
-
-/* fast ARGB888->(A)RGB888 blending with pixel alpha */
-static void
-BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
-{
- int width = info->d_width;
- int height = info->d_height;
- Uint32 *srcp = (Uint32 *) info->s_pixels;
- int srcskip = info->s_skip >> 2;
- Uint32 *dstp = (Uint32 *) info->d_pixels;
- int dstskip = info->d_skip >> 2;
- SDL_PixelFormat *sf = info->src;
- Uint32 amask = sf->Amask;
-
- pxor_r2r(mm6, mm6); /* 0 -> mm6 */
- /* form multiplication mask */
- movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
- punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
- pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
- movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
- pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
- /* form channel masks */
- movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
- packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
- packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
- pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
- /* get alpha channel shift */
- /* *INDENT-OFF* */
- __asm__ __volatile__ (
- "movd %0, %%mm5"
- : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
- /* *INDENT-ON* */
-
- while (height--) {
- /* *INDENT-OFF* */
- DUFFS_LOOP4({
- Uint32 alpha = *srcp & amask;
- /* FIXME: Here we special-case opaque alpha since the
- compositioning used (>>8 instead of /255) doesn't handle
- it correctly. Also special-case alpha=0 for speed?
- Benchmark this! */
- if(alpha == 0) {
- /* do nothing */
- } else if(alpha == amask) {
- /* opaque alpha -- copy RGB, keep dst alpha */
- /* using MMX here to free up regular registers for other things */
- movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
- movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
- pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
- pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
- por_r2r(mm1, mm2); /* src | dst -> mm2 */
- movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
- } else {
- movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
- punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
-
- movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
- punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
-
- __asm__ __volatile__ (
- "movd %0, %%mm4"
- : : "r" (alpha) ); /* 0000A000 -> mm4 */
- psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
- punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
- punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
- pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
-
- /* blend */
- psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
- pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
- psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
- paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
-
- packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
- movd_r2m(mm2, *dstp);/* mm2 -> dst */
- }
- ++srcp;
- ++dstp;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
- emms();
-}
-
-/* End GCC_ASMBLIT */
-
-#elif MSVC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
static void
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
@@ -637,9 +371,9 @@
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
- /* *INDENT-OFF* */
- multmask = ~(0xFFFFI64 << (ashift * 2));
- /* *INDENT-ON* */
+ multmask = 0xFFFF;
+ multmask <<= (ashift * 2);
+ multmask = ~multmask;
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
while (height--) {
@@ -683,9 +417,7 @@
_mm_empty();
}
-/* End MSVC_ASMBLIT */
-
-#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
+#endif /* __MMX__ */
#if SDL_ALTIVEC_BLITTERS
#if __MWERKS__
@@ -1639,123 +1371,7 @@
}
}
-#if GCC_ASMBLIT
-/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
-static void
-BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
-{
- int width = info->d_width;
- int height = info->d_height;
- Uint32 *srcp = (Uint32 *) info->s_pixels;
- int srcskip = info->s_skip >> 2;
- Uint32 *dstp = (Uint32 *) info->d_pixels;
- int dstskip = info->d_skip >> 2;
- SDL_PixelFormat *sf = info->src;
- Uint32 amask = sf->Amask;
-
- __asm__(
- /* make mm6 all zeros. */
- "pxor %%mm6, %%mm6\n"
- /* Make a mask to preserve the alpha. */
- "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
- "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
- "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
- "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
- "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
- /* form channel masks */
- "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
- "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
- "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
- "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
- /* get alpha channel shift */
- "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
- : /* nothing */ : "rm"(amask), "rm"((Uint32) sf->Ashift));
-
- while (height--) {
-
- /* *INDENT-OFF* */
- DUFFS_LOOP4({
- Uint32 alpha;
-
- __asm__ (
- "prefetch 64(%0)\n"
- "prefetch 64(%1)\n"
- : : "r" (srcp), "r" (dstp) );
-
- alpha = *srcp & amask;
- /* FIXME: Here we special-case opaque alpha since the
- compositioning used (>>8 instead of /255) doesn't handle
- it correctly. Also special-case alpha=0 for speed?
- Benchmark this! */
- if(alpha == 0) {
- /* do nothing */
- }
- else if(alpha == amask) {
- /* opaque alpha -- copy RGB, keep dst alpha */
- /* using MMX here to free up regular registers for other things */
- __asm__ (
- "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
- "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
- "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
- "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
- "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
- "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
-
- : : "r" (srcp), "r" (dstp) );
- }
-
- else {
- __asm__ (
- /* load in the source, and dst. */
- "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
- "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
-
- /* Move the src alpha into mm2 */
-
- /* if supporting pshufw */
- /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
- /*"psrlw $8, %%mm2\n" */
-
- /* else: */
- "movd %2, %%mm2\n"
- "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
- "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
- "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
- "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
-
- /* move the colors into words. */
- "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
- "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
-
- /* src - dst */
- "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
-
- /* A * (src-dst) */
- "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
- "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
- "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
-
- "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
-
- "movd %%mm0, (%1)\n" /* result in mm0 */
-
- : : "r" (srcp), "r" (dstp), "r" (alpha) );
-
- }
- ++srcp;
- ++dstp;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
-
- __asm__("emms\n":);
-}
-
-/* End GCC_ASMBLIT*/
-
-#elif MSVC_ASMBLIT
+#ifdef __MMX__
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
@@ -1775,9 +1391,9 @@
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
- /* *INDENT-OFF* */
- multmask = ~(0xFFFFI64 << (ashift * 2));
- /* *INDENT-ON* */
+ multmask = 0xFFFF;
+ multmask <<= (ashift * 2);
+ multmask = ~multmask;
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
while (height--) {
@@ -1826,9 +1442,7 @@
_mm_empty();
}
-/* End MSVC_ASMBLIT */
-
-#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
+#endif /* __MMX__ */
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
@@ -1940,299 +1554,8 @@
}
}
-#if GCC_ASMBLIT
-/* fast RGB565->RGB565 blending with surface alpha */
-static void
-Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
-{
- unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
- if (alpha == 128) {
- Blit16to16SurfaceAlpha128(info, 0xf7de);
- } else {
- int width = info->d_width;
- int height = info->d_height;
- Uint16 *srcp = (Uint16 *) info->s_pixels;
- int srcskip = info->s_skip >> 1;
- Uint16 *dstp = (Uint16 *) info->d_pixels;
- int dstskip = info->d_skip >> 1;
- Uint32 s, d;
- Uint8 load[8];
-
- alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
- *(Uint64 *) load = alpha;
- alpha >>= 3; /* downscale alpha to 5 bits */
-
- movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
- punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
- punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
- /* position alpha to allow for mullo and mulhi on diff channels
- to reduce the number of operations */
- psllq_i2r(3, mm0);
-
- /* Setup the 565 color channel masks */
- *(Uint64 *) load = 0x07E007E007E007E0ULL;
- movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
- *(Uint64 *) load = 0x001F001F001F001FULL;
- movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
- while (height--) {
- /* *INDENT-OFF* */
- DUFFS_LOOP_QUATRO2(
- {
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x07e0f81f;
- d = (d | d << 16) & 0x07e0f81f;
- d += (s - d) * alpha >> 5;
- d &= 0x07e0f81f;
- *dstp++ = d | d >> 16;
- },{
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x07e0f81f;
- d = (d | d << 16) & 0x07e0f81f;
- d += (s - d) * alpha >> 5;
- d &= 0x07e0f81f;
- *dstp++ = d | d >> 16;
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x07e0f81f;
- d = (d | d << 16) & 0x07e0f81f;
- d += (s - d) * alpha >> 5;
- d &= 0x07e0f81f;
- *dstp++ = d | d >> 16;
- },{
- movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
- movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
-
- /* red -- does not need a mask since the right shift clears
- the uninteresting bits */
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
- psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* alpha used is actually 11 bits
- 11 + 5 = 16 bits, so the sign bits are lost */
- psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
- psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
-
- movq_r2r(mm6, mm1); /* save new reds in dsts */
-
- /* green -- process the bits in place */
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
- pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
- bits are gone and the sign bits present */
- psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-
- por_r2r(mm6, mm1); /* save new greens in dsts */
-
- /* blue */
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
- pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* 11 + 5 = 16 bits, so the sign bits are lost and
- the interesting bits will need to be MASKed */
- psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
- pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
-
- por_r2r(mm6, mm1); /* save new blues in dsts */
-
- movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
-
- srcp += 4;
- dstp += 4;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
- emms();
- }
-}
+#ifdef __MMX__
-/* fast RGB555->RGB555 blending with surface alpha */
-static void
-Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
-{
- unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
- if (alpha == 128) {
- Blit16to16SurfaceAlpha128(info, 0xfbde);
- } else {
- int width = info->d_width;
- int height = info->d_height;
- Uint16 *srcp = (Uint16 *) info->s_pixels;
- int srcskip = info->s_skip >> 1;
- Uint16 *dstp = (Uint16 *) info->d_pixels;
- int dstskip = info->d_skip >> 1;
- Uint32 s, d;
- Uint8 load[8];
-
- alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
- *(Uint64 *) load = alpha;
- alpha >>= 3; /* downscale alpha to 5 bits */
-
- movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
- punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
- punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
- /* position alpha to allow for mullo and mulhi on diff channels
- to reduce the number of operations */
- psllq_i2r(3, mm0);
-
- /* Setup the 555 color channel masks */
- *(Uint64 *) load = 0x03E003E003E003E0ULL;
- movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
- *(Uint64 *) load = 0x001F001F001F001FULL;
- movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
- while (height--) {
- /* *INDENT-OFF* */
- DUFFS_LOOP_QUATRO2(
- {
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x03e07c1f;
- d = (d | d << 16) & 0x03e07c1f;
- d += (s - d) * alpha >> 5;
- d &= 0x03e07c1f;
- *dstp++ = d | d >> 16;
- },{
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x03e07c1f;
- d = (d | d << 16) & 0x03e07c1f;
- d += (s - d) * alpha >> 5;
- d &= 0x03e07c1f;
- *dstp++ = d | d >> 16;
- s = *srcp++;
- d = *dstp;
- /*
- * shift out the middle component (green) to
- * the high 16 bits, and process all three RGB
- * components at the same time.
- */
- s = (s | s << 16) & 0x03e07c1f;
- d = (d | d << 16) & 0x03e07c1f;
- d += (s - d) * alpha >> 5;
- d &= 0x03e07c1f;
- *dstp++ = d | d >> 16;
- },{
- movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
- movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
-
- /* red -- process the bits in place */
- psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
- /* by reusing the GREEN mask we free up another mmx
- register to accumulate the result */
-
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
- pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
- cleared by a MASK below */
- psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
- pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
-
- psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
-
- movq_r2r(mm6, mm1); /* save new reds in dsts */
-
- /* green -- process the bits in place */
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
- pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
- bits are gone and the sign bits present */
- psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-
- por_r2r(mm6, mm1); /* save new greens in dsts */
-
- /* blue */
- movq_r2r(mm2, mm5); /* src -> mm5 */
- movq_r2r(mm3, mm6); /* dst -> mm6 */
- pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
- pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
-
- /* blend */
- psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
- pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
- /* 11 + 5 = 16 bits, so the sign bits are lost and
- the interesting bits will need to be MASKed */
- psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
- paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
- pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
-
- por_r2r(mm6, mm1); /* save new blues in dsts */
-
- movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
-
- srcp += 4;
- dstp += 4;
- }, width);
- /* *INDENT-ON* */
- srcp += srcskip;
- dstp += dstskip;
- }
- emms();
- }
-}
-
-/* End GCC_ASMBLIT */
-
-#elif MSVC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
static void
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
@@ -2507,7 +1830,8 @@
_mm_empty();
}
}
-#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
+
+#endif /* __MMX__ */
/* fast RGB565->RGB565 blending with surface alpha */
static void
@@ -2852,14 +2176,14 @@
case 2:
if (surface->map->identity) {
if (df->Gmask == 0x7e0) {
-#if MMX_ASMBLIT
+#ifdef __MMX__
if (SDL_HasMMX())
return Blit565to565SurfaceAlphaMMX;
else
#endif
return Blit565to565SurfaceAlpha;
} else if (df->Gmask == 0x3e0) {
-#if MMX_ASMBLIT
+#ifdef __MMX__
if (SDL_HasMMX())
return Blit555to555SurfaceAlphaMMX;
else
@@ -2873,7 +2197,7 @@
if (sf->Rmask == df->Rmask
&& sf->Gmask == df->Gmask
&& sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
-#if MMX_ASMBLIT
+#ifdef __MMX__
if (sf->Rshift % 8 == 0
&& sf->Gshift % 8 == 0
&& sf->Bshift % 8 == 0 && SDL_HasMMX())
@@ -2928,7 +2252,7 @@
if (sf->Rmask == df->Rmask
&& sf->Gmask == df->Gmask
&& sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
-#if MMX_ASMBLIT
+#ifdef __MMX__
if (sf->Rshift % 8 == 0
&& sf->Gshift % 8 == 0
&& sf->Bshift % 8 == 0