Fixed bug #15
author Sam Lantinga <slouken@libsdl.org>
Wed, 15 Mar 2006 15:39:29 +0000
changeset 1542 a8bf1aa21020
parent 1541 157001382dfd
child 1543 98f9b16f565c
Fixed bug #15 SDL_blit_A.mmx-speed.patch.txt -- Speed improvements and a bugfix for the current GCC inline MMX asm code: - Changed some ops and removed some resulting useless ones. - Added some instruction parallelism (some gain). The resulting speed on my Xeon improved up to 35% depending on the function (measured in fps). - Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was setting the alpha component on the destination surfaces (to opaque-alpha) even when the surface had none. SDL_blit_A.mmx-msvc.patch.txt -- MSVC MMX intrinsics version of the same GCC asm code. The MSVC compiler tries to parallelize the code and to avoid register stalls, but does not always do a very good job. Per-surface blending MSVC functions run quite a bit faster than their pure-asm counterparts (up to 55% faster for 16-bit ones), but the per-pixel blending runs somewhat slower than asm. - BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all variants) can now also handle formats other than (A)RGB8888. Formats like RGBA8888 and some quite exotic ones are allowed -- like RAGB8888, or actually anything having channels aligned on an 8-bit boundary and full 8-bit alpha (for per-pixel alpha blending). The performance cost of this change is virtually 0 for per-surface alpha blending (no extra ops inside the loop) and a single non-MMX op inside the loop for per-pixel blending. In testing, the per-pixel alpha blending takes a ~2% performance hit, but it still runs much faster than the current code in CVS. If necessary, a separate function with this functionality can be made. This code requires the Processor Pack for VC6.
src/video/SDL_blit_A.c
--- a/src/video/SDL_blit_A.c	Wed Mar 15 05:52:31 2006 +0000
+++ b/src/video/SDL_blit_A.c	Wed Mar 15 15:39:29 2006 +0000
@@ -24,14 +24,23 @@
 #include "SDL_video.h"
 #include "SDL_blit.h"
 
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
+#if SDL_ASSEMBLY_ROUTINES
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 #define MMX_ASMBLIT 1
+#define GCC_ASMBLIT 1
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
+#define MMX_ASMBLIT 1
+#define MSVC_ASMBLIT 1
 #endif
+#endif /* SDL_ASSEMBLY_ROUTINES */
 
 /* Function to check the CPU flags */
 #include "SDL_cpuinfo.h"
-#if MMX_ASMBLIT
+#if GCC_ASMBLIT
 #include "mmx.h"
+#elif MSVC_ASMBLIT
+#include <mmintrin.h>
+#include <mm3dnow.h>
 #endif
 
 /* Functions to perform alpha blended blitting */
@@ -198,7 +207,7 @@
 	}
 }
 
-#if MMX_ASMBLIT
+#if GCC_ASMBLIT
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
 {
@@ -208,43 +217,44 @@
 	int srcskip = info->s_skip >> 2;
 	Uint32 *dstp = (Uint32 *)info->d_pixels;
 	int dstskip = info->d_skip >> 2;
-        Uint8 load[8];
-  
-        *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
-        movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
-        *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
-        movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
-        *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */
-        movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
+	Uint32 dalpha = info->dst->Amask;
+	Uint8 load[8];
+
+	*(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
+	movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
+	*(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
+	movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
+	movd_m2r(dalpha, mm7); /* dst alpha mask */
+	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
 	while(height--) {
-            DUFFS_LOOP_DOUBLE2(
-            {
-		    Uint32 s = *srcp++;
-		    Uint32 d = *dstp;
-		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
-			       + (s & d & 0x00010101)) | 0xff000000;
-            },{
-	            movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
-	            movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
-	      
-	            movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
-	            movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
-		
-	            pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
-	            pand_r2r(mm4, mm5); /* src & mask -> mm5 */
-	            paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
-	            psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
-	
-	            pand_r2r(mm1, mm2); /* src & dst -> mm2 */
-	            pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
-	            paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
-	            por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
-	            movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
-	            dstp += 2;
-	            srcp += 2;
-            }, width);
-	    srcp += srcskip;
-	    dstp += dstskip;
+		DUFFS_LOOP_DOUBLE2(
+		{
+			Uint32 s = *srcp++;
+			Uint32 d = *dstp;
+			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
+				   + (s & d & 0x00010101)) | dalpha;
+		},{
+			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
+			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
+
+			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
+			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
+
+			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
+			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
+			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
+			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
+			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
+			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
+			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
+			
+			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
+			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
+			dstp += 2;
+			srcp += 2;
+		}, width);
+		srcp += srcskip;
+		dstp += dstskip;
 	}
 	emms();
 }
@@ -252,8 +262,11 @@
 /* fast RGB888->(A)RGB888 blending with surface alpha */
 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 {
+	SDL_PixelFormat* df = info->dst;
 	unsigned alpha = info->src->alpha;
-	if(alpha == 128) {
+
+	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
+			/* only call a128 version when R,G,B occupy lower bits */
 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
 	} else {
 		int width = info->d_width;
@@ -262,75 +275,68 @@
 		int srcskip = info->s_skip >> 2;
 		Uint32 *dstp = (Uint32 *)info->d_pixels;
 		int dstskip = info->d_skip >> 2;
-                Uint8 load[8] = {alpha, alpha, alpha, alpha,
-    					alpha, alpha, alpha, alpha};
-					
-                movq_m2r(*load, mm4); /* alpha -> mm4 */
-		*(Uint64 *)load = 0x00FF00FF00FF00FFULL;
-                movq_m2r(*load, mm3); /* mask -> mm3 */
-		pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */
-		*(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */
-		movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
+
+		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
+		/* form the alpha mult */
+		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
+		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
+		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
+		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
+		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
+		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
+		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
+			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
+		movd_m2r(df->Amask, mm7); /* dst alpha mask */
+		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
 		
 		while(height--) {
 			DUFFS_LOOP_DOUBLE2({
 				/* One Pixel Blend */
-	                        movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
-                                punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
-                                pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
-			  
-	                        movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
-			        movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/
-                                punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
-                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
-			  
-                                psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
-	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
-	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
-	                        paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
-	                        pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
-	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
-	                        por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
-			        movd_r2m(mm2, *dstp);/* mm2 -> Pixel */
+				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
+				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
+				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
+				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
+
+				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
+				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
+				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
+				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
+
+				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
+				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
+				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
 				++srcp;
 				++dstp;
 			},{
-			        /* Two Pixels Blend */
+				/* Two Pixels Blend */
 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
-			        movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
-                                punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */
-			        pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */
-			        punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */
-	                        pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */
-	
-	                        movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
-	                        movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */
-			        movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
-                                punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */
-	                        punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */
-                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */
-	                  
-                                psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
-	                        pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
-			        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */
-			        psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
-			        psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
-	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
-				paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
-	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */
-				pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
-	                        paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
-	                        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */
-	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
-	                        packuswb_r2r(mm6, mm6);  /* ARGBARGB -> mm6 */
-	                        psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */
-	                        psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */
-	                        por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */				
-				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
-                                movq_r2m(mm2, *dstp);/* mm2 -> 2 x Pixel */
-				srcp += 2;
-				dstp += 2;
-			}, width);
+				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
+				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
+				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
+
+				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
+				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
+				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
+				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
+
+				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
+				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
+				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
+				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
+
+				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
+				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
+				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
+				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
+
+				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
+				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
+				
+				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
+
+  				srcp += 2;
+  				dstp += 2;
+  			}, width);
 			srcp += srcskip;
 			dstp += dstskip;
 		}
@@ -347,62 +353,65 @@
 	int srcskip = info->s_skip >> 2;
 	Uint32 *dstp = (Uint32 *)info->d_pixels;
 	int dstskip = info->d_skip >> 2;
-        Uint32 alpha = 0;
-        Uint8 load[8];
-	                
-	*(Uint64 *)load = 0x00FF00FF00FF00FFULL;
-        movq_m2r(*load, mm3); /* mask -> mm2 */
-	*(Uint64 *)load = 0x00FF000000000000ULL;
-        movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */
-        *(Uint64 *)load = 0x00FFFFFF00FFFFFFULL;
-        movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */
-        *(Uint64 *)load = 0xFF000000FF000000ULL;
-        movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */
+	SDL_PixelFormat* sf = info->src;
+	Uint32 amask = sf->Amask;
+
+	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
+	/* form multiplication mask */
+	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
+	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
+	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
+	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
+	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
+	/* form channel masks */
+	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
+	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
+	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
+	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
+	/* get alpha channel shift */
+	movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */
+
 	while(height--) {
 	    DUFFS_LOOP4({
-	        alpha = *srcp;
-	        alpha >>= 24;
+		Uint32 alpha = *srcp & amask;
 		/* FIXME: Here we special-case opaque alpha since the
-		   compositioning used (>>8 instead of /255) doesn't handle
-		   it correctly. Also special-case alpha=0 for speed?
-		   Benchmark this! */
-		if(alpha) {   
-		  if(alpha == SDL_ALPHA_OPAQUE) {
-		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
-		    movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
-		    pand_r2r(mm0, mm1);
-		    pand_r2r(mm6, mm2);
-		    por_r2r(mm1, mm2);
-		    movd_r2m(mm2, (*dstp));
-		  } else {
-		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
-                    punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
-                    pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
-			  
-	            movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
-                    punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
-                    pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
-		
-		    movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */
-		    pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */
-		    psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/
-		    
-		    movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */
-		    psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */
-		    punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
-                    punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
-		                        		    
-                    /* blend */		    
-                    psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
-	            pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
-	            psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
-	            paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
-	            pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
-		    packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
-		    pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */
-		    por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */
-		    movd_r2m(mm2, *dstp);/* mm2 -> dst */
-		  }
+			compositioning used (>>8 instead of /255) doesn't handle
+			it correctly. Also special-case alpha=0 for speed?
+			Benchmark this! */
+		if(alpha == 0) {
+			/* do nothing */
+		} else if(alpha == amask) {
+			/* opaque alpha -- copy RGB, keep dst alpha */
+			/* using MMX here to free up regular registers for other things */
+			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
+			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
+			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
+			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
+			por_r2r(mm1, mm2); /* src | dst -> mm2 */
+			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
+		} else {
+			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
+			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
+
+			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
+			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
+
+			__asm__ __volatile__ (
+				"movd %0, %%mm4"
+				: : "r" (alpha) ); /* 0000A000 -> mm4 */
+			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
+			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
+			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
+			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
+
+			/* blend */		    
+			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
+			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
+			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
+			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
+			
+			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
+			movd_r2m(mm2, *dstp);/* mm2 -> dst */
 		}
 		++srcp;
 		++dstp;
@@ -412,7 +421,220 @@
 	}
 	emms();
 }
-#endif
+/* End GCC_ASMBLIT */
+
+#elif MSVC_ASMBLIT
+/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case (MSVC MMX intrinsics): per-channel average of src and dst */
+static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
+{
+	int width = info->d_width;
+	int height = info->d_height;
+	Uint32 *srcp = (Uint32 *)info->s_pixels;
+	int srcskip = info->s_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	Uint32 *dstp = (Uint32 *)info->d_pixels;
+	int dstskip = info->d_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	Uint32 dalpha = info->dst->Amask; /* ORed into every written pixel; 0 when dst has no alpha mask */
+
+	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
+	
+	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* upper 7 bits of the three low bytes, for 2 pixels -> hmask */
+	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* lowest bit of the three low bytes, for 2 pixels -> lmask */
+	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask for 2 pixels -> dsta */
+
+	while (height--) {
+		int n = width;
+		if ( n & 1 ) { /* odd width: blend the first pixel of the row with scalar code */
+			Uint32 s = *srcp++;
+			Uint32 d = *dstp;
+			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
+				   + (s & d & 0x00010101)) | dalpha;
+			n--;
+		}
+		
+		for (n >>= 1; n > 0; --n) { /* remaining pixels, two per iteration */
+			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
+			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
+
+			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
+			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
+
+			dst2 = _mm_and_si64(dst2, hmask); /* dst & hmask -> dst2 */
+			src2 = _mm_and_si64(src2, hmask); /* src & hmask -> src2 */
+			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
+			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 (sum of high bits, halved) */
+
+			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
+			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & lmask -> dst1 (low bits set in both src and dst) */
+			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
+			dst1 = _mm_or_si64(dst1, dsta); /* dsta (dst Amask) | dst1 -> dst1 */
+			
+			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
+			dstp += 2;
+			srcp += 2;
+		}
+		
+		srcp += srcskip;
+		dstp += dstskip;
+	}
+	_mm_empty(); /* leave MMX state before returning */
+}
+
+/* fast RGB888->(A)RGB888 blending with surface alpha (MSVC MMX intrinsics): dst += (src - dst) * alpha >> 8 per colour channel */
+static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
+{
+	SDL_PixelFormat* df = info->dst;
+	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask; /* NOTE(review): this value is not read; chanmask is reassigned from the shifts below */
+	unsigned alpha = info->src->alpha;
+
+	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
+			/* only call a128 version when R,G,B occupy lower bits (its masks are hard-coded to 0x00fefefe / 0x00010101) */
+		BlitRGBtoRGBSurfaceAlpha128MMX(info);
+	} else {
+		int width = info->d_width;
+		int height = info->d_height;
+		Uint32 *srcp = (Uint32 *)info->s_pixels;
+		int srcskip = info->s_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+		Uint32 *dstp = (Uint32 *)info->d_pixels;
+		int dstskip = info->d_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+		Uint32 dalpha = df->Amask; /* ORed into every written pixel; 0 when dst has no alpha mask */
+		Uint32 amult;
+
+		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
+		
+		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
+		/* form the alpha mult */
+		amult = alpha | (alpha << 8); /* alpha replicated into the two low bytes */
+		amult = amult | (amult << 16); /* alpha replicated into all four bytes */
+		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); /* one 0xff byte per colour channel of dst */
+		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan (the non-colour byte is 0) */
+		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
+			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
+		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask for 2 pixels -> dsta */
+		
+		while (height--) {
+			int n = width;
+			if (n & 1) {
+				/* One Pixel Blend (odd width: handle the first pixel of the row alone) */
+				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
+				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
+
+				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
+				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
+
+				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
+				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
+				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
+				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 (byte-wise add; high byte of every word is 0 in both operands) */
+				
+				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
+				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
+				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
+
+				++srcp;
+				++dstp;
+				
+				n--;
+			}
+
+			for (n >>= 1; n > 0; --n) {
+				/* Two Pixels Blend */
+				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
+				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
+				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
+				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
+
+				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
+				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
+				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
+				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
+
+				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
+				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
+				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
+				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1 (byte-wise add) */
+
+				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
+				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
+				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
+				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2 -> dst2 (byte-wise add) */
+				
+				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
+				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
+
+				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
+
+				srcp += 2;
+				dstp += 2;
+			}
+			srcp += srcskip;
+			dstp += dstskip;
+		}
+		_mm_empty(); /* leave MMX state before returning */
+	}
+}
+
+/* fast ARGB888->(A)RGB888 blending with pixel alpha (MSVC MMX intrinsics); alpha position is taken from the source format */
+static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
+{
+	int width = info->d_width;
+	int height = info->d_height;
+	Uint32 *srcp = (Uint32 *)info->s_pixels;
+	int srcskip = info->s_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	Uint32 *dstp = (Uint32 *)info->d_pixels;
+	int dstskip = info->d_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	SDL_PixelFormat* sf = info->src;
+	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; /* colour bits of a source pixel */
+	Uint32 amask = sf->Amask;
+	Uint32 ashift = sf->Ashift;
+	Uint64 multmask;
+
+	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
+
+	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
+	multmask = ~(0xFFFFi64 << (ashift * 2)); /* all ones except the 16-bit word the alpha byte occupies once bytes are unpacked to words */
+	dmask = *(__m64*) &multmask; /* multiplier mask (alpha word cleared) -> dmask */
+
+	while(height--) {
+		DUFFS_LOOP4({
+		Uint32 alpha = *srcp & amask; /* source alpha, still at its channel position */
+		if (alpha == 0) {
+			/* do nothing */
+		} else if (alpha == amask) {
+			/* opaque alpha -- copy RGB, keep dst alpha */
+			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
+		} else {
+			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
+			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
+
+			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
+			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
+
+			mm_alpha = _mm_cvtsi32_si64(alpha); /* masked alpha (not yet shifted down) -> mm_alpha */
+			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
+			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
+			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
+			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* zero the multiplier in the alpha word (e.g. 000A0A0A) -> mm_alpha, preserve dst alpha on add */
+
+			/* blend */		    
+			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
+			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
+			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 (alpha word is 0, e.g. 000R0G0B) */
+			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B), byte-wise add */
+			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
+			
+			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
+		}
+		++srcp;
+		++dstp;
+	    }, width);
+	    srcp += srcskip;
+	    dstp += dstskip;
+	}
+	_mm_empty(); /* leave MMX state before returning */
+}
+/* End MSVC_ASMBLIT */
+
+#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
 
 #if SDL_ALTIVEC_BLITTERS
 #if HAVE_ALTIVEC_H
@@ -1326,7 +1548,7 @@
 	}
 }
 
-#if MMX_ASMBLIT
+#if GCC_ASMBLIT
 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
 {
@@ -1336,38 +1558,61 @@
 	int srcskip = info->s_skip >> 2;
 	Uint32 *dstp = (Uint32 *)info->d_pixels;
 	int dstskip = info->d_skip >> 2;
-
-	Uint32 s;
-	Uint32 alpha;
+	SDL_PixelFormat* sf = info->src;
+	Uint32 amask = sf->Amask;
 
 	__asm__ (
 	/* make mm6 all zeros. */
 	"pxor       %%mm6, %%mm6\n"
 	
 	/* Make a mask to preserve the alpha. */
-	"pcmpeqb   %%mm7, %%mm7\n\t"            /* mm7(s) = FF FF FF FF | FF FF FF FF */
-	"psrlq     $16, %%mm7\n\t"		    /* mm7(s) = 00 00 FF FF | FF FF FF FF */
+	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
+	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
+	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
+	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
+	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
 
-		: );
+	/* form channel masks */
+	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
+	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
+	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
+	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
+	
+	/* get alpha channel shift */
+	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
+
+	  : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) );
 
 	while(height--) {
 
 	    DUFFS_LOOP4({
+		Uint32 alpha;
 
 		__asm__ (
 		"prefetch 64(%0)\n"
 		"prefetch 64(%1)\n"
 			: : "r" (srcp), "r" (dstp) );
 
-		s = *srcp;
-		alpha = s >> 24;
+		alpha = *srcp & amask;
 		/* FIXME: Here we special-case opaque alpha since the
 		   compositioning used (>>8 instead of /255) doesn't handle
 		   it correctly. Also special-case alpha=0 for speed?
 		   Benchmark this! */
-		
-		if(alpha == SDL_ALPHA_OPAQUE) {
-		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
+		if(alpha == 0) {
+		    /* do nothing */
+		}
+		else if(alpha == amask) {
+			/* opaque alpha -- copy RGB, keep dst alpha */
+		    /* using MMX here to free up regular registers for other things */
+			    __asm__ (
+		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
+		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
+		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
+		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
+		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
+		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
+
+		     : : "r" (srcp), "r" (dstp) );
 		} 
 
 		else {
@@ -1383,10 +1628,11 @@
 		    /*"psrlw     $8, %%mm2\n" */
 		    
 		    /* else: */
-		    "movq      %%mm0, %%mm2\n"
-		    "psrld     $24, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
+		    "movd       %2,    %%mm2\n"
+		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
+		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
 
 		    /* move the colors into words. */
 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
@@ -1396,16 +1642,15 @@
 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
 
 		    /* A * (src-dst) */
-		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = As*As-d As*Rs-d | As*Gs-d  As*Bs-d */
-		    "pand      %%mm7, %%mm0\n"              /* to preserve dest alpha */
-		    "psrlw     $8,    %%mm0\n"		    /* mm0 = Ac>>8 Rc>>8 | Gc>>8  Bc>>8 */
-		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = Ac+Ad Rc+Rd | Gc+Gd  Bc+Bd */
+		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
+		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
+		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
 
 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
 		    
 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
 
-		     : : "r" (srcp), "r" (dstp) );
+		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
 
 		}
 		++srcp;
@@ -1419,7 +1664,76 @@
 	"emms\n"
 		:   );
 }
-#endif
+/* End GCC_ASMBLIT*/
+
+#elif MSVC_ASMBLIT
+/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha (MSVC intrinsics version) */
+static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
+{
+	int width = info->d_width;
+	int height = info->d_height;
+	Uint32 *srcp = (Uint32 *)info->s_pixels;
+	int srcskip = info->s_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	Uint32 *dstp = (Uint32 *)info->d_pixels;
+	int dstskip = info->d_skip >> 2; /* NOTE(review): presumably byte skip -> Uint32 units; confirm */
+	SDL_PixelFormat* sf = info->src;
+	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; /* colour bits of a source pixel */
+	Uint32 amask = sf->Amask;
+	Uint32 ashift = sf->Ashift;
+	Uint64 multmask;
+	
+	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
+
+	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
+	multmask = ~(0xFFFFi64 << (ashift * 2)); /* all ones except the 16-bit word the alpha byte occupies once bytes are unpacked to words */
+	dmask = *(__m64*) &multmask; /* multiplier mask (alpha word cleared) -> dmask */
+
+	while(height--) {
+	    DUFFS_LOOP4({
+		Uint32 alpha;
+
+		_m_prefetch(srcp + 16); /* srcp is Uint32*, so +16 is 64 bytes ahead */
+		_m_prefetch(dstp + 16); /* dstp is Uint32*, so +16 is 64 bytes ahead */
+
+		alpha = *srcp & amask; /* source alpha, still at its channel position */
+		if (alpha == 0) {
+			/* do nothing */
+		} else if (alpha == amask) {
+			/* opaque alpha -- copy RGB, keep dst alpha */
+			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
+		} else {
+			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
+			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
+
+			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
+			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
+
+			mm_alpha = _mm_cvtsi32_si64(alpha); /* masked alpha (not yet shifted down) -> mm_alpha */
+			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
+			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
+			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
+			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* zero the multiplier in the alpha word (e.g. 000A0A0A) -> mm_alpha, preserve dst alpha on add */
+
+			/* blend */		    
+			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
+			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
+			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 (alpha word is 0, e.g. 000R0G0B) */
+			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B), byte-wise add */
+			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
+			
+			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
+		}
+		++srcp;
+		++dstp;
+	    }, width);
+	    srcp += srcskip;
+	    dstp += dstskip;
+	}
+	_mm_empty(); /* leave MMX state before returning */
+}
+/* End MSVC_ASMBLIT */
+
+#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
 
 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
 
@@ -1530,7 +1844,7 @@
 	}
 }
 
-#if MMX_ASMBLIT
+#if GCC_ASMBLIT
 /* fast RGB565->RGB565 blending with surface alpha */
 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
 {
@@ -1544,28 +1858,29 @@
 		int srcskip = info->s_skip >> 1;
 		Uint16 *dstp = (Uint16 *)info->d_pixels;
 		int dstskip = info->d_skip >> 1;
-	        Uint32 s, d;
-	        Uint8 load[8];
+		Uint32 s, d;
+		Uint8 load[8];
 	  
 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
-	        *(Uint64 *)load = alpha;
+		*(Uint64 *)load = alpha;
 		alpha >>= 3;		/* downscale alpha to 5 bits */
 
-                movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
-                punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
-                punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
+		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
+		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
+		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
+		/* position alpha to allow for mullo and mulhi on diff channels
+		   to reduce the number of operations */
+		psllq_i2r(3, mm0);
 	  
- 	        /* Setup the 565 color channel masks */
-	        *(Uint64 *)load = 0xF800F800F800F800ULL;
-		movq_m2r(*load, mm1); /* MASKRED -> mm1 */
+		/* Setup the 565 color channel masks */
 		*(Uint64 *)load = 0x07E007E007E007E0ULL;
 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
 		*(Uint64 *)load = 0x001F001F001F001FULL;
 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
 		while(height--) {
-                        DUFFS_LOOP_QUATRO2(
-                        {
-	                        s = *srcp++;
+			DUFFS_LOOP_QUATRO2(
+			{
+				s = *srcp++;
 				d = *dstp;
 				/*
 				 * shift out the middle component (green) to
@@ -1577,8 +1892,8 @@
 				d += (s - d) * alpha >> 5;
 				d &= 0x07e0f81f;
 				*dstp++ = d | d >> 16;
-                        },{
-	                        s = *srcp++;
+			},{
+				s = *srcp++;
 				d = *dstp;
 				/*
 				 * shift out the middle component (green) to
@@ -1590,7 +1905,7 @@
 				d += (s - d) * alpha >> 5;
 				d &= 0x07e0f81f;
 				*dstp++ = d | d >> 16;
-			        s = *srcp++;
+				s = *srcp++;
 				d = *dstp;
 				/*
 				 * shift out the middle component (green) to
@@ -1602,78 +1917,66 @@
 				d += (s - d) * alpha >> 5;
 				d &= 0x07e0f81f;
 				*dstp++ = d | d >> 16;
-                        },{
-	                        movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
-	                        movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
-			  
-	                        /* RED */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
-	                        psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
-	
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
-	                        psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */
-	                        pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
-	
-	                        movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
-	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new reds in dsts */
-	
-	                        /* green */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
-	                        psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
-	
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
-	                        psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
-	                        pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
-	
-	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
-	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new greens in dsts */
-	
-	                        /* blue */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */
-		
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
-	
-	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
-	                        por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new blues in dsts */
-	
-	                        movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
-	
-	                        srcp += 4;
-	                        dstp += 4;
-                        }, width);			
+			},{
+				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
+				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
+
+				/* red -- does not need a mask since the right shift clears
+				   the uninteresting bits */
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
+				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* alpha used is actually 11 bits
+				   11 + 5 = 16 bits, so the sign bits are lost */
+				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
+
+				movq_r2r(mm6, mm1); /* save new reds in dsts */
+
+				/* green -- process the bits in place */
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
+				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
+				   bits are gone and the sign bits present */
+				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+
+				por_r2r(mm6, mm1); /* save new greens in dsts */
+
+				/* blue */
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
+				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* 11 + 5 = 16 bits, so the sign bits are lost and
+				   the interesting bits will need to be MASKed */
+				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
+
+				por_r2r(mm6, mm1); /* save new blues in dsts */
+
+				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
+
+				srcp += 4;
+				dstp += 4;
+			}, width);			
 			srcp += srcskip;
 			dstp += dstskip;
 		}
@@ -1694,28 +1997,29 @@
 		int srcskip = info->s_skip >> 1;
 		Uint16 *dstp = (Uint16 *)info->d_pixels;
 		int dstskip = info->d_skip >> 1;
-	        Uint32 s, d;
-	        Uint8 load[8];
+		Uint32 s, d;
+		Uint8 load[8];
 	  
 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
-	        *(Uint64 *)load = alpha;
+		*(Uint64 *)load = alpha;
 		alpha >>= 3;		/* downscale alpha to 5 bits */
 
-                movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
-                punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
-                punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
-	  
- 	        /* Setup the 555 color channel masks */
-	        *(Uint64 *)load = 0x7C007C007C007C00ULL;
-		movq_m2r(*load, mm1); /* MASKRED -> mm1 */
+		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
+		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
+		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
+		/* position alpha to allow for mullo and mulhi on diff channels
+		   to reduce the number of operations */
+		psllq_i2r(3, mm0);
+
+		/* Setup the 555 color channel masks */
 		*(Uint64 *)load = 0x03E003E003E003E0ULL;
 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
 		*(Uint64 *)load = 0x001F001F001F001FULL;
 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
 		while(height--) {
-                        DUFFS_LOOP_QUATRO2(
-                        {
-	                        s = *srcp++;
+			DUFFS_LOOP_QUATRO2(
+			{
+				s = *srcp++;
 				d = *dstp;
 				/*
 				 * shift out the middle component (green) to
@@ -1727,8 +2031,8 @@
 				d += (s - d) * alpha >> 5;
 				d &= 0x03e07c1f;
 				*dstp++ = d | d >> 16;
-                        },{
-	                        s = *srcp++;
+			},{
+				s = *srcp++;
 				d = *dstp;
 				/*
 				 * shift out the middle component (green) to
@@ -1752,85 +2056,349 @@
 				d += (s - d) * alpha >> 5;
 				d &= 0x03e07c1f;
 				*dstp++ = d | d >> 16;
-                        },{
-	                        movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
-	                        movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
-			  
-	                        /* RED */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
-	                        psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */
-	
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
-	                        psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */
-	                        pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
-	
-	                        movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
-	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new reds in dsts */
-	
-	                        /* green */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
-	                        psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
-	
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
-	                        psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
-	                        pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
-	
-	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
-	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new greens in dsts */
-	
-	                        /* blue */
-	                        movq_r2r(mm2, mm5); /* src -> mm5 */
-	                        pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */
-		
-	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
-	                        pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
-	
-	                        /* blend */
-	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
-	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
-	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
-	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
-	                        pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
-	
-	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
-	                        por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
-	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
-	                        por_r2r(mm6, mm3); /* save new blues in dsts */
-	
-	                        movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
-	
-	                        srcp += 4;
-	                        dstp += 4;
-                        }, width);			
+			},{
+				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
+				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
+
+				/* red -- process the bits in place */
+				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
+					/* by reusing the GREEN mask we free up another mmx
+					   register to accumulate the result */
+
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
+				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
+				   cleared by a MASK below */
+				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
+
+				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
+
+				movq_r2r(mm6, mm1); /* save new reds in dsts */
+
+				/* green -- process the bits in place */
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
+				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
+				   bits are gone and the sign bits present */
+				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+
+				por_r2r(mm6, mm1); /* save new greens in dsts */
+
+				/* blue */
+				movq_r2r(mm2, mm5); /* src -> mm5 */
+				movq_r2r(mm3, mm6); /* dst -> mm6 */
+				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
+				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
+
+				/* blend */
+				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
+				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
+				/* 11 + 5 = 16 bits, so the sign bits are lost and
+				   the interesting bits will need to be MASKed */
+				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
+				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
+				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
+
+				por_r2r(mm6, mm1); /* save new blues in dsts */
+
+				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
+
+				srcp += 4;
+				dstp += 4;
+			}, width);			
 			srcp += srcskip;
 			dstp += dstskip;
 		}
 		emms();
 	}
 }
-#endif
+/* End GCC_ASMBLIT */
+
+#elif MSVC_ASMBLIT
+/* fast RGB565->RGB565 blending with surface alpha (MMX intrinsics variant, built when MSVC_ASMBLIT is set) */
+static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
+{
+	unsigned alpha = info->src->alpha;
+	if(alpha == 128) {
+		Blit16to16SurfaceAlpha128(info, 0xf7de); /* alpha == 128: delegate to the dedicated 16bpp routine */
+	} else {
+		int width = info->d_width;
+		int height = info->d_height;
+		Uint16 *srcp = (Uint16 *)info->s_pixels;
+		int srcskip = info->s_skip >> 1;
+		Uint16 *dstp = (Uint16 *)info->d_pixels;
+		int dstskip = info->d_skip >> 1;
+		Uint32 s, d;
+	  
+		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
+
+		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
+		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
+		alpha >>= 3;		/* downscale alpha to 5 bits (used by the scalar paths below) */
+
+		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
+		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
+		/* position alpha to allow for mullo and mulhi on diff channels
+		   to reduce the number of operations */
+		mm_alpha = _mm_slli_si64(mm_alpha, 3); /* 8-bit alpha << 3 -> 11-bit alpha in every word */
+	  
+		/* Setup the 565 color channel masks */
+		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
+		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
+		
+		while(height--) {
+			DUFFS_LOOP_QUATRO2(
+			{ /* one pixel, scalar path */
+				s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x07e0f81f;
+				d = (d | d << 16) & 0x07e0f81f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x07e0f81f;
+				*dstp++ = d | d >> 16;
+			},{ /* two pixels, scalar path */
+				s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x07e0f81f;
+				d = (d | d << 16) & 0x07e0f81f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x07e0f81f;
+				*dstp++ = d | d >> 16;
+				s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x07e0f81f;
+				d = (d | d << 16) & 0x07e0f81f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x07e0f81f;
+				*dstp++ = d | d >> 16;
+			},{ /* four pixels per step, MMX path */
+				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
+				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
+
+				/* red -- no mask needed, the right shift clears the other channels */
+				src2 = src1;
+				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
+
+				dst2 = dst1;
+				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
+				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 (11-bit alpha + 5-bit channel = 16 bits, so the sign bits are lost) */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 (back in place, stray high bits shifted out) */
+
+				mm_res = dst2; /* RED -> mm_res */
+
+				/* green -- process the bits in place */
+				src2 = src1;
+				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
+
+				dst2 = dst1;
+				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* high word of src2 * alpha -> src2 (11 + 11 - 16 = 6 bits, sign kept) */
+				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+
+				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
+
+				/* blue */
+				src2 = src1;
+				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
+
+				dst2 = dst1;
+				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
+				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 (11 + 5 = 16 bits, sign bits lost; masked below) */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
+
+				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
+
+				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
+
+				srcp += 4;
+				dstp += 4;
+			}, width);			
+			srcp += srcskip;
+			dstp += dstskip;
+		}
+		_mm_empty(); /* leave MMX state before returning */
+	}
+}
+
+/* fast RGB555->RGB555 blending with surface alpha (MMX intrinsics variant, built when MSVC_ASMBLIT is set) */
+static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
+{
+	unsigned alpha = info->src->alpha;
+	if(alpha == 128) {
+		Blit16to16SurfaceAlpha128(info, 0xfbde); /* alpha == 128: delegate to the dedicated 16bpp routine */
+	} else {
+		int width = info->d_width;
+		int height = info->d_height;
+		Uint16 *srcp = (Uint16 *)info->s_pixels;
+		int srcskip = info->s_skip >> 1;
+		Uint16 *dstp = (Uint16 *)info->d_pixels;
+		int dstskip = info->d_skip >> 1;
+		Uint32 s, d;
+	  
+		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
+
+		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
+		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
+		alpha >>= 3;		/* downscale alpha to 5 bits (used by the scalar paths below) */
+
+		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
+		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
+		/* position alpha to allow for mullo and mulhi on diff channels
+		   to reduce the number of operations */
+		mm_alpha = _mm_slli_si64(mm_alpha, 3); /* 8-bit alpha << 3 -> 11-bit alpha in every word */
+	  
+		/* Setup the 555 color channel masks */
+		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
+		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
+		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
+
+		while(height--) {
+			DUFFS_LOOP_QUATRO2(
+			{ /* one pixel, scalar path */
+				s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x03e07c1f;
+				d = (d | d << 16) & 0x03e07c1f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x03e07c1f;
+				*dstp++ = d | d >> 16;
+			},{ /* two pixels, scalar path */
+				s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x03e07c1f;
+				d = (d | d << 16) & 0x03e07c1f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x03e07c1f;
+				*dstp++ = d | d >> 16;
+			        s = *srcp++;
+				d = *dstp;
+				/*
+				 * shift out the middle component (green) to
+				 * the high 16 bits, and process all three RGB
+				 * components at the same time.
+				 */
+				s = (s | s << 16) & 0x03e07c1f;
+				d = (d | d << 16) & 0x03e07c1f;
+				d += (s - d) * alpha >> 5; /* alpha is 5 bits here */
+				d &= 0x03e07c1f;
+				*dstp++ = d | d >> 16;
+			},{ /* four pixels per step, MMX path */
+				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
+				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
+
+				/* red -- process the bits in place */
+				src2 = src1;
+				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
+
+				dst2 = dst1;
+				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* high word of src2 * alpha -> src2 (11 + 15 - 16 = 10 bits) */
+				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 (clears the leftover low bits) */
+
+				mm_res = dst2; /* RED -> mm_res */
+				
+				/* green -- process the bits in place */
+				src2 = src1;
+				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
+
+				dst2 = dst1;
+				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* high word of src2 * alpha -> src2 (11 + 10 - 16 = 5 bits, sign kept) */
+				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+
+				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
+
+				/* blue */
+				src2 = src1; /* src -> src2 */
+				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
+
+				dst2 = dst1; /* dst -> dst2 */
+				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
+
+				/* blend */
+				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
+				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
+				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 (11 + 5 = 16 bits, sign bits lost; masked below) */
+				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
+				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
+
+				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
+
+				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
+
+				srcp += 4;
+				dstp += 4;
+			}, width);			
+			srcp += srcskip;
+			dstp += dstskip;
+		}
+		_mm_empty(); /* leave MMX state before returning */
+	}
+}
+#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
 
 /* fast RGB565->RGB565 blending with surface alpha */
 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
@@ -2177,29 +2745,31 @@
 		if(sf->Rmask == df->Rmask
 		   && sf->Gmask == df->Gmask
 		   && sf->Bmask == df->Bmask
-		   && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff
 		   && sf->BytesPerPixel == 4)
 		{
 #if MMX_ASMBLIT
-		if(SDL_HasMMX())
-		    return BlitRGBtoRGBSurfaceAlphaMMX;
+			if(sf->Rshift % 8 == 0
+			   && sf->Gshift % 8 == 0
+			   && sf->Bshift % 8 == 0
+			   && SDL_HasMMX())
+			    return BlitRGBtoRGBSurfaceAlphaMMX; /* taken for any byte-aligned RGB layout when MMX is present */
+#endif
+			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) /* RGB in the low 24 bits */
+			{
+#if SDL_ALTIVEC_BLITTERS
+				if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) /* AltiVec only for non-hardware destinations */
+					return BlitRGBtoRGBSurfaceAlphaAltivec;
+#endif
+				return BlitRGBtoRGBSurfaceAlpha;
+			}
+		}
+#if SDL_ALTIVEC_BLITTERS
+		if((sf->BytesPerPixel == 4) &&
+		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
+			return Blit32to32SurfaceAlphaAltivec;
 		else
 #endif
-#if SDL_ALTIVEC_BLITTERS
-	if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
-            return BlitRGBtoRGBSurfaceAlphaAltivec;
-        else
-#endif
-		    return BlitRGBtoRGBSurfaceAlpha;
-		}
-		else
-#if SDL_ALTIVEC_BLITTERS
-        if((sf->BytesPerPixel == 4) &&
-	   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
-            return Blit32to32SurfaceAlphaAltivec;
-        else
-#endif
-		    return BlitNtoNSurfaceAlpha;
+			return BlitNtoNSurfaceAlpha; /* generic fallback */
 
 	    case 3:
 	    default:
@@ -2232,34 +2802,40 @@
 	    return BlitNtoNPixelAlpha;
 
 	case 4:
-	    if(sf->Amask == 0xff000000
-	       && sf->Rmask == df->Rmask
+	    if(sf->Rmask == df->Rmask
 	       && sf->Gmask == df->Gmask
 	       && sf->Bmask == df->Bmask
 	       && sf->BytesPerPixel == 4)
 	    {
 #if MMX_ASMBLIT
-		if(SDL_Has3DNow())
-		    return BlitRGBtoRGBPixelAlphaMMX3DNOW;
-		else
-		if(SDL_HasMMX())
-		    return BlitRGBtoRGBPixelAlphaMMX;
-		else
+		if(sf->Rshift % 8 == 0
+		   && sf->Gshift % 8 == 0
+		   && sf->Bshift % 8 == 0
+		   && sf->Ashift % 8 == 0
+		   && sf->Aloss == 0)
+		{ /* every channel byte-aligned and alpha not reduced (Aloss == 0) */
+			if(SDL_Has3DNow())
+				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
+			if(SDL_HasMMX())
+				return BlitRGBtoRGBPixelAlphaMMX;
+		}
 #endif
-#if SDL_ALTIVEC_BLITTERS
-	if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
-            return BlitRGBtoRGBPixelAlphaAltivec;
-        else
+		if(sf->Amask == 0xff000000) /* alpha in the top byte */
+		{
+#if SDL_ALTIVEC_BLITTERS
+			if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) /* AltiVec only for non-hardware destinations */
+				return BlitRGBtoRGBPixelAlphaAltivec;
 #endif
-		    return BlitRGBtoRGBPixelAlpha;
+			return BlitRGBtoRGBPixelAlpha;
+		}
 	    }
 #if SDL_ALTIVEC_BLITTERS
-        if (sf->Amask && sf->BytesPerPixel == 4 &&
-	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
-            return Blit32to32PixelAlphaAltivec;
-        else
+	    if (sf->Amask && sf->BytesPerPixel == 4 &&
+	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
+		return Blit32to32PixelAlphaAltivec;
+	    else
 #endif
-	    return BlitNtoNPixelAlpha;
+		return BlitNtoNPixelAlpha; /* generic fallback */
 
 	case 3:
 	default: