/* author: Sam Lantinga <slouken@libsdl.org> */
/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2004 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Sam Lantinga
    slouken@libsdl.org
*/

#ifdef SAVE_RCSID
/* Revision-control identification string, compiled in only when
   SAVE_RCSID is defined.  It has to be an array of char: a single
   `char` object cannot be initialised from a string literal. */
static const char rcsid[] =
 "@(#) $Id$";
#endif

#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) 

30 

31 
#include "SDL_types.h" 

32 

33 
/* GCC2_HACK selects how the inline assembly in this file refers to the
   constant tables: when it is defined (GCC 2.x and older) the symbol
   names are written directly into the asm text; otherwise the tables are
   handed to the asm statement as named "m" operands. */
#if __GNUC__ > 2
#  undef GCC2_HACK
#else
#  define GCC2_HACK
#endif

/* ASM_VAR(X) gives the C identifier under which constant table X is
   defined.  With GCC2_HACK on an ELF target the identifier gets a leading
   underscore, which is the spelling the GCC2_HACK asm text uses
   (e.g. _MMX_0080w); in every other configuration X is used unchanged. */
#if defined(GCC2_HACK) && defined (__ELF__)
#define ASM_VAR(X) _##X
#else
#define ASM_VAR(X) X
#endif

/*
 * Constant tables used as memory operands by the MMX code below.
 * Every table holds the same 16-bit value in each of its four word
 * lanes (the `unsigned int` tables pack two lanes per element).
 * The colour coefficients are fixed point, scaled by 64: the assembly
 * shifts the products right by 6 after multiplying.
 */

/* Word-lane constants and byte masks. */
static volatile unsigned int ASM_VAR(MMX_0080w)[] = {0x00800080, 0x00800080}; /* 128 per word: chroma bias */
static volatile unsigned int ASM_VAR(MMX_00FFw)[] = {0x00ff00ff, 0x00ff00ff}; /* keeps the low byte of each word */
static volatile unsigned int ASM_VAR(MMX_FF00w)[] = {0xff00ff00, 0xff00ff00}; /* keeps the high byte of each word */

/* Luma coefficient: 74 (used by the 565 converter only). */
static volatile unsigned short ASM_VAR(MMX_Ycoeff)[] = {0x004a, 0x004a, 0x004a, 0x004a};

/* Chroma coefficients for the 32-bit RGB converter. */
static volatile unsigned short ASM_VAR(MMX_UbluRGB)[] = {0x0072, 0x0072, 0x0072, 0x0072}; /* Cb -> blue:  114 */
static volatile unsigned short ASM_VAR(MMX_VredRGB)[] = {0x0059, 0x0059, 0x0059, 0x0059}; /* Cr -> red:    89 */
static volatile unsigned short ASM_VAR(MMX_UgrnRGB)[] = {0xffea, 0xffea, 0xffea, 0xffea}; /* Cb -> green: -22 */
static volatile unsigned short ASM_VAR(MMX_VgrnRGB)[] = {0xffd2, 0xffd2, 0xffd2, 0xffd2}; /* Cr -> green: -46 */

/* Chroma coefficients for the 16-bit (555 / 565) converters. */
static volatile unsigned short ASM_VAR(MMX_Ublu5x5)[] = {0x0081, 0x0081, 0x0081, 0x0081}; /* Cb -> blue:  129 */
static volatile unsigned short ASM_VAR(MMX_Vred5x5)[] = {0x0066, 0x0066, 0x0066, 0x0066}; /* Cr -> red:   102 */
static volatile unsigned short ASM_VAR(MMX_Ugrn555)[] = {0xffe7, 0xffe7, 0xffe7, 0xffe7}; /* Cb -> green: -25 */
static volatile unsigned short ASM_VAR(MMX_Vgrn555)[] = {0xffcc, 0xffcc, 0xffcc, 0xffcc}; /* Cr -> green: -52 */
static volatile unsigned short ASM_VAR(MMX_Ugrn565)[] = {0xffe8, 0xffe8, 0xffe8, 0xffe8}; /* Cb -> green: -24 */
static volatile unsigned short ASM_VAR(MMX_Vgrn565)[] = {0xffcd, 0xffcd, 0xffcd, 0xffcd}; /* Cr -> green: -51 */

/* Channel bit masks of the packed 16-bit pixel formats. */
static volatile unsigned short ASM_VAR(MMX_red555)[] = {0x7c00, 0x7c00, 0x7c00, 0x7c00}; /* bits 14..10 */
static volatile unsigned short ASM_VAR(MMX_red565)[] = {0xf800, 0xf800, 0xf800, 0xf800}; /* bits 15..11 */
static volatile unsigned short ASM_VAR(MMX_grn555)[] = {0x03e0, 0x03e0, 0x03e0, 0x03e0}; /* bits  9..5  */
static volatile unsigned short ASM_VAR(MMX_grn565)[] = {0x07e0, 0x07e0, 0x07e0, 0x07e0}; /* bits 10..5  */
static volatile unsigned short ASM_VAR(MMX_blu5x5)[] = {0x001f, 0x001f, 0x001f, 0x001f}; /* bits  4..0  */

/**
   This MMX assembler is my first assembler/MMX program ever.
   Thus it may be buggy.
   Send patches to:
   mvogt@rhrk.uni-kl.de

   After it worked fine I "obfuscated" the code a bit to get
   more parallelism in the MMX units. This means I moved
   initialisation around and delayed other instructions.
   Performance measurement did not show that this brought any advantage
   but in theory it _should_ be faster this way.

   The overall performance gain over the C based dither was 30%-40%.
   The MMX routine calculates 256 bit = 8 RGB values in each cycle
   (4 for row1 & 4 for row2)

   The red/green/blue.. coefficients are taken from the mpeg_play
   player. They look nice, but I don't know if you can have
   better values, to avoid integer rounding errors.


   IMPORTANT:
   ==========

   It is a requirement that the cr/cb/lum are 8 byte aligned and
   the out are 16 byte aligned or you will/may get segfaults

*/

/*
 * Convert a planar YV12 image to 32-bit pixels (0x00RRGGBB) with MMX.
 *
 * Each pass of the inner loop consumes 4 luma bytes from the current row,
 * 4 luma bytes from the row below it, and 2 Cr + 2 Cb bytes (each chroma
 * sample is shared by a 2x2 block of pixels), and stores 4 pixels to each
 * of two output rows.  After a full row the luma pointer skips the row
 * that was already handled as "row below" and both output pointers skip
 * one whole output row.
 *
 *   colortab, rgb_2_pix  unused by this implementation
 *   lum                  luma (Y) plane, cols*rows bytes
 *   cr, cb               chroma planes
 *   out                  destination; one output row is cols+mod pixels
 *   rows, cols           image size in pixels; the loop advances 4 pixels
 *                        and 2 rows at a time, so cols is expected to be a
 *                        multiple of 4 and rows a multiple of 2
 *   mod                  extra pixels at the end of every output row
 *
 * NOTE(review): operands %1, %2, %3 and %5 and the memory operand `x` are
 * modified by the assembly although they are declared as inputs, which
 * GCC's extended-asm rules do not allow.  Nothing reads them afterwards,
 * so this has been left alone; a clean fix needs in/out operands and a
 * renumbering of every operand reference.
 * NOTE(review): `pushl %ebx` moves %esp.  If the compiler addresses the
 * "m" operands relative to %esp (e.g. with the frame pointer omitted),
 * they are off by 4 bytes inside the asm - confirm the build keeps a
 * frame pointer for this file.
 */
void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
                              unsigned char *lum, unsigned char *cr,
                              unsigned char *cb, unsigned char *out,
                              int rows, int cols, int mod )
{
    Uint32 *row1;
    Uint32 *row2;
    unsigned char *y;
    int x = 0;

    (void)colortab;
    (void)rgb_2_pix;

    /* The asm loop tests its conditions at the bottom, so without this
       guard an empty image would still be read and written once. */
    if (rows <= 0 || cols <= 0) {
        return;
    }

    y    = lum + cols*rows;              // pointer to the end of the luma plane
    row1 = (Uint32 *)out;                // 32 bit target
    row2 = (Uint32 *)out + cols + mod;   // start of second row
    mod  = (mod + cols + mod) * 4;       // increment for row1/row2 in bytes

    __asm__ __volatile__ (
/* We don't really care about PIC - the code should be rewritten to use
   relative addressing for the static tables, so right now we take the
   COW hit on the pages this code resides. Big deal.
   This spill is just to reduce register pressure in the PIC case. */
        "pushl %%ebx\n"
        "movl %0, %%ebx\n"

        ".align 8\n"
        "1:\n"

        // create Cr (result in mm1)
        "movd (%%ebx), %%mm1\n"          //         0  0  0  0 v3 v2 v1 v0
        "pxor %%mm7,%%mm7\n"             //         00 00 00 00 00 00 00 00
        "movd (%2), %%mm2\n"             //          0  0  0  0 l3 l2 l1 l0
        "punpcklbw %%mm7,%%mm1\n"        //         0 v3  0 v2 00 v1 00 v0
        "punpckldq %%mm1,%%mm1\n"        //         00 v1 00 v0 00 v1 00 v0
#ifdef GCC2_HACK
        "psubw _MMX_0080w,%%mm1\n"       // mm1-128: r1 r1 r0 r0 r1 r1 r0 r0
#else
        "psubw %[_MMX_0080w],%%mm1\n"    // mm1-128: r1 r1 r0 r0 r1 r1 r0 r0
#endif

        // create Cr_g (result in mm0)
        "movq %%mm1,%%mm0\n"             // r1 r1 r0 r0 r1 r1 r0 r0
#ifdef GCC2_HACK
        "pmullw _MMX_VgrnRGB,%%mm0\n"    // Cr * -46 (about -0.714*64)
        "pmullw _MMX_VredRGB,%%mm1\n"    // Cr *  89 (about  1.402*64)
#else
        "pmullw %[_MMX_VgrnRGB],%%mm0\n" // Cr * -46 (about -0.714*64)
        "pmullw %[_MMX_VredRGB],%%mm1\n" // Cr *  89 (about  1.402*64)
#endif
        "psraw $6, %%mm0\n"              // Cr_g /= 64
        "psraw $6, %%mm1\n"              // red  /= 64

        // create L1 L2 (result in mm2,mm4)
        // L2=lum+cols
        // Only the low 4 bytes are used, so load exactly 4 (movd): an
        // 8-byte load would read past the end of the luma plane on the
        // last block of the image.
        "movd (%2,%4),%%mm3\n"           //  0  0  0  0 L3 L2 L1 L0
        "punpckldq %%mm3,%%mm2\n"        // L3 L2 L1 L0 l3 l2 l1 l0
        "movq %%mm2,%%mm4\n"             // L3 L2 L1 L0 l3 l2 l1 l0
#ifdef GCC2_HACK
        "pand _MMX_FF00w,%%mm2\n"        // L3 0 L1 0 l3 0 l1 0
        "pand _MMX_00FFw,%%mm4\n"        // 0 L2 0 L0 0 l2 0 l0
#else
        "pand %[_MMX_FF00w],%%mm2\n"     // L3 0 L1 0 l3 0 l1 0
        "pand %[_MMX_00FFw],%%mm4\n"     // 0 L2 0 L0 0 l2 0 l0
#endif
        "psrlw $8,%%mm2\n"               // 0 L3 0 L1 0 l3 0 l1

        // create R (result in mm6)
        "movq %%mm2,%%mm5\n"             // 0 L3 0 L1 0 l3 0 l1
        "movq %%mm4,%%mm6\n"             // 0 L2 0 L0 0 l2 0 l0
        "paddsw %%mm1, %%mm5\n"          // lum1+red:x R3 x R1 x r3 x r1
        "paddsw %%mm1, %%mm6\n"          // lum1+red:x R2 x R0 x r2 x r0
        "packuswb %%mm5,%%mm5\n"         // R3 R1 r3 r1 R3 R1 r3 r1
        "packuswb %%mm6,%%mm6\n"         // R2 R0 r2 r0 R2 R0 r2 r0
        "pxor %%mm7,%%mm7\n"             // 00 00 00 00 00 00 00 00
        "punpcklbw %%mm5,%%mm6\n"        // R3 R2 R1 R0 r3 r2 r1 r0

        // create Cb (result in mm1)
        "movd (%1), %%mm1\n"             //         0  0  0  0 u3 u2 u1 u0
        "punpcklbw %%mm7,%%mm1\n"        //         0 u3  0 u2 00 u1 00 u0
        "punpckldq %%mm1,%%mm1\n"        //         00 u1 00 u0 00 u1 00 u0
#ifdef GCC2_HACK
        "psubw _MMX_0080w,%%mm1\n"       // mm1-128: u1 u1 u0 u0 u1 u1 u0 u0
#else
        "psubw %[_MMX_0080w],%%mm1\n"    // mm1-128: u1 u1 u0 u0 u1 u1 u0 u0
#endif
        // create Cb_g (result in mm5)
        "movq %%mm1,%%mm5\n"             // u1 u1 u0 u0 u1 u1 u0 u0
#ifdef GCC2_HACK
        "pmullw _MMX_UgrnRGB,%%mm5\n"    // Cb * -22 (MMX_UgrnRGB)
        "pmullw _MMX_UbluRGB,%%mm1\n"    // Cb * 114 (= 1.78125*64)
#else
        "pmullw %[_MMX_UgrnRGB],%%mm5\n" // Cb * -22 (MMX_UgrnRGB)
        "pmullw %[_MMX_UbluRGB],%%mm1\n" // Cb * 114 (= 1.78125*64)
#endif
        "psraw $6, %%mm5\n"              // Cb_g /= 64
        "psraw $6, %%mm1\n"              // blue /= 64

        // create G (result in mm7)
        "movq %%mm2,%%mm3\n"             // 0 L3 0 L1 0 l3 0 l1
        "movq %%mm4,%%mm7\n"             // 0 L2 0 L0 0 l2 0 l0
        "paddsw %%mm5, %%mm3\n"          // lum1+Cb_g:x G3t x G1t x g3t x g1t
        "paddsw %%mm5, %%mm7\n"          // lum1+Cb_g:x G2t x G0t x g2t x g0t
        "paddsw %%mm0, %%mm3\n"          // lum1+Cr_g:x G3 x G1 x g3 x g1
        "paddsw %%mm0, %%mm7\n"          // lum1+Cr_g:x G2 x G0 x g2 x g0
        "packuswb %%mm3,%%mm3\n"         // G3 G1 g3 g1 G3 G1 g3 g1
        "packuswb %%mm7,%%mm7\n"         // G2 G0 g2 g0 G2 G0 g2 g0
        "punpcklbw %%mm3,%%mm7\n"        // G3 G2 G1 G0 g3 g2 g1 g0

        // create B (result in mm5)
        "movq %%mm2,%%mm3\n"             // 0 L3 0 L1 0 l3 0 l1
        "movq %%mm4,%%mm5\n"             // 0 L2 0 L0 0 l2 0 l0
        "paddsw %%mm1, %%mm3\n"          // lum1+blue:x B3 x B1 x b3 x b1
        "paddsw %%mm1, %%mm5\n"          // lum1+blue:x B2 x B0 x b2 x b0
        "packuswb %%mm3,%%mm3\n"         // B3 B1 b3 b1 B3 B1 b3 b1
        "packuswb %%mm5,%%mm5\n"         // B2 B0 b2 b0 B2 B0 b2 b0
        "punpcklbw %%mm3,%%mm5\n"        // B3 B2 B1 B0 b3 b2 b1 b0

        // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)

        "pxor %%mm2,%%mm2\n"             // 0 0 0 0 0 0 0 0
        "pxor %%mm4,%%mm4\n"             // 0 0 0 0 0 0 0 0
        "movq %%mm6,%%mm1\n"             // R3 R2 R1 R0 r3 r2 r1 r0
        "movq %%mm5,%%mm3\n"             // B3 B2 B1 B0 b3 b2 b1 b0
        // process lower lum
        "punpcklbw %%mm4,%%mm1\n"        // 0 r3 0 r2 0 r1 0 r0
        "punpcklbw %%mm4,%%mm3\n"        // 0 b3 0 b2 0 b1 0 b0
        "movq %%mm1,%%mm2\n"             // 0 r3 0 r2 0 r1 0 r0
        "movq %%mm3,%%mm0\n"             // 0 b3 0 b2 0 b1 0 b0
        "punpcklwd %%mm1,%%mm3\n"        // 0 r1 0 b1 0 r0 0 b0
        "punpckhwd %%mm2,%%mm0\n"        // 0 r3 0 b3 0 r2 0 b2

        "pxor %%mm2,%%mm2\n"             // 0 0 0 0 0 0 0 0
        "movq %%mm7,%%mm1\n"             // G3 G2 G1 G0 g3 g2 g1 g0
        "punpcklbw %%mm1,%%mm2\n"        // g3 0 g2 0 g1 0 g0 0
        "punpcklwd %%mm4,%%mm2\n"        // 0 0 g1 0 0 0 g0 0
        "por %%mm3, %%mm2\n"             // 0 r1 g1 b1 0 r0 g0 b0
        "movq %%mm2,(%3)\n"              // wrote out ! row1

        "pxor %%mm2,%%mm2\n"             // 0 0 0 0 0 0 0 0
        "punpcklbw %%mm1,%%mm4\n"        // g3 0 g2 0 g1 0 g0 0
        "punpckhwd %%mm2,%%mm4\n"        // 0 0 g3 0 0 0 g2 0
        "por %%mm0, %%mm4\n"             // 0 r3 g3 b3 0 r2 g2 b2
        "movq %%mm4,8(%3)\n"             // wrote out ! row1

        // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
        // this can be done "destructive"
        "pxor %%mm2,%%mm2\n"             // 0 0 0 0 0 0 0 0
        "punpckhbw %%mm2,%%mm6\n"        // 0 R3 0 R2 0 R1 0 R0
        "punpckhbw %%mm1,%%mm5\n"        // G3 B3 G2 B2 G1 B1 G0 B0
        "movq %%mm5,%%mm1\n"             // G3 B3 G2 B2 G1 B1 G0 B0
        "punpcklwd %%mm6,%%mm1\n"        // 0 R1 G1 B1 0 R0 G0 B0
        "movq %%mm1,(%5)\n"              // wrote out ! row2
        "punpckhwd %%mm6,%%mm5\n"        // 0 R3 G3 B3 0 R2 G2 B2
        "movq %%mm5,8(%5)\n"             // wrote out ! row2

        "addl $4,%2\n"                   // lum+4
        "leal 16(%3),%3\n"               // row1+16
        "leal 16(%5),%5\n"               // row2+16
        "addl $2, %%ebx\n"               // cr+2
        "addl $2, %1\n"                  // cb+2

        "addl $4,%6\n"                   // x+4
        "cmpl %4,%6\n"

        "jl 1b\n"                        // while (x < cols): signed int compare
        "addl %4, %2\n"                  // lum += cols
        "addl %8, %3\n"                  // row1+= mod
        "addl %8, %5\n"                  // row2+= mod
        "movl $0, %6\n"                  // x=0
        "cmpl %7, %2\n"
        "jb 1b\n"                        // while (lum < y): addresses compare unsigned
        "emms\n"
        "popl %%ebx\n"
        :
        : "m" (cr), "r"(cb), "r"(lum),
          "r"(row1), "r"(cols), "r"(row2), "m"(x), "m"(y), "m"(mod)
#ifndef GCC2_HACK
          ,[_MMX_0080w] "m" (*MMX_0080w),
          [_MMX_00FFw] "m" (*MMX_00FFw),
          [_MMX_FF00w] "m" (*MMX_FF00w),
          [_MMX_VgrnRGB] "m" (*MMX_VgrnRGB),
          [_MMX_VredRGB] "m" (*MMX_VredRGB),
          [_MMX_UgrnRGB] "m" (*MMX_UgrnRGB),
          [_MMX_UbluRGB] "m" (*MMX_UbluRGB)
#endif
        /* The asm stores to the output rows and to x, and changes the
           flags; tell the compiler so it cannot cache memory across it. */
        : "memory", "cc"
    );
}

/*
 * Convert a planar YV12 image to 16-bit RGB565 pixels with MMX.
 *
 * Each pass of the inner loop consumes 8 luma bytes from the current row,
 * 8 luma bytes from the row below it, and 4 Cr + 4 Cb bytes (each chroma
 * sample is shared by a 2x2 block of pixels), and stores 8 pixels to each
 * of two output rows.  Luma is scaled by MMX_Ycoeff, the chroma terms are
 * added, the sums are shifted right by 6 and saturated to 8 bits, and the
 * channels are masked into place: red in bits 15..11, green in bits
 * 10..5, blue in bits 4..0.  Even and odd pixels are computed in separate
 * registers and interleaved just before they are stored.
 *
 *   colortab, rgb_2_pix  unused by this implementation
 *   lum                  luma (Y) plane, cols*rows bytes
 *   cr, cb               chroma planes
 *   out                  destination; one output row is cols+mod pixels
 *   rows, cols           image size in pixels; the loop advances 8 pixels
 *                        and 2 rows at a time, so cols is expected to be a
 *                        multiple of 8 and rows a multiple of 2
 *   mod                  extra pixels at the end of every output row
 *
 * NOTE(review): operands %1, %2, %3 and %5 and the memory operand `x` are
 * modified by the assembly although they are declared as inputs, which
 * GCC's extended-asm rules do not allow.  Nothing reads them afterwards,
 * so this has been left alone; a clean fix needs in/out operands and a
 * renumbering of every operand reference.
 * NOTE(review): `pushl %ebx` moves %esp.  If the compiler addresses the
 * "m" operands relative to %esp (e.g. with the frame pointer omitted),
 * they are off by 4 bytes inside the asm - confirm the build keeps a
 * frame pointer for this file.
 */
void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
                              unsigned char *lum, unsigned char *cr,
                              unsigned char *cb, unsigned char *out,
                              int rows, int cols, int mod )
{
    Uint16 *row1;
    Uint16 *row2;
    unsigned char *y;
    int x = 0;

    (void)colortab;
    (void)rgb_2_pix;

    /* The asm loop tests its conditions at the bottom, so without this
       guard an empty image would still be read and written once. */
    if (rows <= 0 || cols <= 0) {
        return;
    }

    y    = lum + cols*rows;              /* pointer to the end of the luma plane */
    row1 = (Uint16 *)out;                /* 16 bit target */
    row2 = (Uint16 *)out + cols + mod;   /* start of second row */
    mod  = (mod + cols + mod) * 2;       /* increment for row1/row2 in bytes */

    __asm__ __volatile__(
        /* ebx is spilled by hand and used for cr (see the PIC remark in
           ColorRGBDitherYV12MMX1X). */
        "pushl %%ebx\n"
        "movl %0, %%ebx\n"

        ".align 8\n"
        "1:\n"
        "movd (%1), %%mm0\n"                // 4 Cb     0  0  0  0 u3 u2 u1 u0
        "pxor %%mm7, %%mm7\n"
        "movd (%%ebx), %%mm1\n"             // 4 Cr     0  0  0  0 v3 v2 v1 v0
        "punpcklbw %%mm7, %%mm0\n"          // 4 W cb   0 u3  0 u2  0 u1  0 u0
        "punpcklbw %%mm7, %%mm1\n"          // 4 W cr   0 v3  0 v2  0 v1  0 v0
#ifdef GCC2_HACK
        "psubw _MMX_0080w, %%mm0\n"         // Cb - 128
        "psubw _MMX_0080w, %%mm1\n"         // Cr - 128
#else
        "psubw %[_MMX_0080w], %%mm0\n"      // Cb - 128
        "psubw %[_MMX_0080w], %%mm1\n"      // Cr - 128
#endif
        "movq %%mm0, %%mm2\n"               // Cb       0 u3  0 u2  0 u1  0 u0
        "movq %%mm1, %%mm3\n"               // Cr
#ifdef GCC2_HACK
        "pmullw _MMX_Ugrn565, %%mm2\n"      // Cb2green
#else
        "pmullw %[_MMX_Ugrn565], %%mm2\n"   // Cb2green
#endif
        "movq (%2), %%mm6\n"                // L1      L7 L6 L5 L4 L3 L2 L1 L0
#ifdef GCC2_HACK
        "pmullw _MMX_Ublu5x5, %%mm0\n"      // Cb2blue
        "pand _MMX_00FFw, %%mm6\n"          // L1      00 L6 00 L4 00 L2 00 L0
        "pmullw _MMX_Vgrn565, %%mm3\n"      // Cr2green
#else
        "pmullw %[_MMX_Ublu5x5], %%mm0\n"   // Cb2blue
        "pand %[_MMX_00FFw], %%mm6\n"       // L1      00 L6 00 L4 00 L2 00 L0
        "pmullw %[_MMX_Vgrn565], %%mm3\n"   // Cr2green
#endif
        "movq (%2), %%mm7\n"                // L2
#ifdef GCC2_HACK
        "pmullw _MMX_Vred5x5, %%mm1\n"      // Cr2red
#else
        "pmullw %[_MMX_Vred5x5], %%mm1\n"   // Cr2red
#endif
        "psrlw $8, %%mm7\n"                 // L2      00 L7 00 L5 00 L3 00 L1
#ifdef GCC2_HACK
        "pmullw _MMX_Ycoeff, %%mm6\n"       // lum1
#else
        "pmullw %[_MMX_Ycoeff], %%mm6\n"    // lum1
#endif
        "paddw %%mm3, %%mm2\n"              // Cb2green + Cr2green == green
#ifdef GCC2_HACK
        "pmullw _MMX_Ycoeff, %%mm7\n"       // lum2
#else
        "pmullw %[_MMX_Ycoeff], %%mm7\n"    // lum2
#endif

        "movq %%mm6, %%mm4\n"               // lum1
        "paddw %%mm0, %%mm6\n"              // lum1 +blue  00 B6 00 B4 00 B2 00 B0
        "movq %%mm4, %%mm5\n"               // lum1
        "paddw %%mm1, %%mm4\n"              // lum1 +red   00 R6 00 R4 00 R2 00 R0
        "paddw %%mm2, %%mm5\n"              // lum1 +green 00 G6 00 G4 00 G2 00 G0
        "psraw $6, %%mm4\n"                 // R1 0 .. 64
        "movq %%mm7, %%mm3\n"               // lum2        00 L7 00 L5 00 L3 00 L1
        "psraw $6, %%mm5\n"                 // G1 - .. +
        "paddw %%mm0, %%mm7\n"              // Lum2 +blue  00 B7 00 B5 00 B3 00 B1
        "psraw $6, %%mm6\n"                 // B1 0 .. 64
        "packuswb %%mm4, %%mm4\n"           // R1 R1
        "packuswb %%mm5, %%mm5\n"           // G1 G1
        "packuswb %%mm6, %%mm6\n"           // B1 B1
        "punpcklbw %%mm4, %%mm4\n"
        "punpcklbw %%mm5, %%mm5\n"

#ifdef GCC2_HACK
        "pand _MMX_red565, %%mm4\n"
#else
        "pand %[_MMX_red565], %%mm4\n"
#endif
        "psllw $3, %%mm5\n"                 // GREEN 1
        "punpcklbw %%mm6, %%mm6\n"
#ifdef GCC2_HACK
        "pand _MMX_grn565, %%mm5\n"
        "pand _MMX_red565, %%mm6\n"
#else
        "pand %[_MMX_grn565], %%mm5\n"
        "pand %[_MMX_red565], %%mm6\n"
#endif
        "por %%mm5, %%mm4\n"                //
        "psrlw $11, %%mm6\n"                // BLUE 1
        "movq %%mm3, %%mm5\n"               // lum2
        "paddw %%mm1, %%mm3\n"              // lum2 +red   00 R7 00 R5 00 R3 00 R1
        "paddw %%mm2, %%mm5\n"              // lum2 +green 00 G7 00 G5 00 G3 00 G1
        "psraw $6, %%mm3\n"                 // R2
        "por %%mm6, %%mm4\n"                // MM4
        "psraw $6, %%mm5\n"                 // G2
        "movq (%2, %4), %%mm6\n"            // L3 load lum2
        "psraw $6, %%mm7\n"
        "packuswb %%mm3, %%mm3\n"
        "packuswb %%mm5, %%mm5\n"
        "packuswb %%mm7, %%mm7\n"
#ifdef GCC2_HACK
        "pand _MMX_00FFw, %%mm6\n"          // L3
#else
        "pand %[_MMX_00FFw], %%mm6\n"       // L3
#endif
        "punpcklbw %%mm3, %%mm3\n"
        "punpcklbw %%mm5, %%mm5\n"
#ifdef GCC2_HACK
        "pmullw _MMX_Ycoeff, %%mm6\n"       // lum3
#else
        "pmullw %[_MMX_Ycoeff], %%mm6\n"    // lum3
#endif
        "punpcklbw %%mm7, %%mm7\n"
        "psllw $3, %%mm5\n"                 // GREEN 2
#ifdef GCC2_HACK
        "pand _MMX_red565, %%mm7\n"
        "pand _MMX_red565, %%mm3\n"
#else
        "pand %[_MMX_red565], %%mm7\n"
        "pand %[_MMX_red565], %%mm3\n"
#endif
        "psrlw $11, %%mm7\n"                // BLUE 2
#ifdef GCC2_HACK
        "pand _MMX_grn565, %%mm5\n"
#else
        "pand %[_MMX_grn565], %%mm5\n"
#endif
        "por %%mm7, %%mm3\n"
        "movq (%2,%4), %%mm7\n"             // L4 load lum2
        "por %%mm5, %%mm3\n"                //
        "psrlw $8, %%mm7\n"                 // L4
        "movq %%mm4, %%mm5\n"
        "punpcklwd %%mm3, %%mm4\n"
#ifdef GCC2_HACK
        "pmullw _MMX_Ycoeff, %%mm7\n"       // lum4
#else
        "pmullw %[_MMX_Ycoeff], %%mm7\n"    // lum4
#endif
        "punpckhwd %%mm3, %%mm5\n"

        "movq %%mm4, (%3)\n"                // write row1
        "movq %%mm5, 8(%3)\n"               // write row1

        "movq %%mm6, %%mm4\n"               // Lum3
        "paddw %%mm0, %%mm6\n"              // Lum3 +blue

        "movq %%mm4, %%mm5\n"               // Lum3
        "paddw %%mm1, %%mm4\n"              // Lum3 +red
        "paddw %%mm2, %%mm5\n"              // Lum3 +green
        "psraw $6, %%mm4\n"
        "movq %%mm7, %%mm3\n"               // Lum4
        "psraw $6, %%mm5\n"
        "paddw %%mm0, %%mm7\n"              // Lum4 +blue
        "psraw $6, %%mm6\n"                 // Lum3 +blue
        "movq %%mm3, %%mm0\n"               // Lum4
        "packuswb %%mm4, %%mm4\n"
        "paddw %%mm1, %%mm3\n"              // Lum4 +red
        "packuswb %%mm5, %%mm5\n"
        "paddw %%mm2, %%mm0\n"              // Lum4 +green
        "packuswb %%mm6, %%mm6\n"
        "punpcklbw %%mm4, %%mm4\n"
        "punpcklbw %%mm5, %%mm5\n"
        "punpcklbw %%mm6, %%mm6\n"
        "psllw $3, %%mm5\n"                 // GREEN 3
#ifdef GCC2_HACK
        "pand _MMX_red565, %%mm4\n"
#else
        "pand %[_MMX_red565], %%mm4\n"
#endif
        "psraw $6, %%mm3\n"                 // psr 6
        "psraw $6, %%mm0\n"
#ifdef GCC2_HACK
        "pand _MMX_red565, %%mm6\n"         // BLUE
        "pand _MMX_grn565, %%mm5\n"
#else
        "pand %[_MMX_red565], %%mm6\n"      // BLUE
        "pand %[_MMX_grn565], %%mm5\n"
#endif
        "psrlw $11, %%mm6\n"                // BLUE 3
        "por %%mm5, %%mm4\n"
        "psraw $6, %%mm7\n"
        "por %%mm6, %%mm4\n"
        "packuswb %%mm3, %%mm3\n"
        "packuswb %%mm0, %%mm0\n"
        "packuswb %%mm7, %%mm7\n"
        "punpcklbw %%mm3, %%mm3\n"
        "punpcklbw %%mm0, %%mm0\n"
        "punpcklbw %%mm7, %%mm7\n"
#ifdef GCC2_HACK
        "pand _MMX_red565, %%mm3\n"
        "pand _MMX_red565, %%mm7\n"         // BLUE
#else
        "pand %[_MMX_red565], %%mm3\n"
        "pand %[_MMX_red565], %%mm7\n"      // BLUE
#endif
        "psllw $3, %%mm0\n"                 // GREEN 4
        "psrlw $11, %%mm7\n"
#ifdef GCC2_HACK
        "pand _MMX_grn565, %%mm0\n"
#else
        "pand %[_MMX_grn565], %%mm0\n"
#endif
        "por %%mm7, %%mm3\n"
        "por %%mm0, %%mm3\n"

        "movq %%mm4, %%mm5\n"

        "punpcklwd %%mm3, %%mm4\n"
        "punpckhwd %%mm3, %%mm5\n"

        "movq %%mm4, (%5)\n"                // write row2
        "movq %%mm5, 8(%5)\n"               // write row2

        "addl $8, %6\n"                     // x+8
        "addl $8, %2\n"                     // lum+8
        "addl $4, %%ebx\n"                  // cr+4
        "addl $4, %1\n"                     // cb+4
        "cmpl %4, %6\n"
        "leal 16(%3), %3\n"                 // row1+16 (lea leaves the flags alone)
        "leal 16(%5),%5\n"                  // row2+16

        "jl 1b\n"                           // while (x < cols): signed int compare
        "addl %4, %2\n"                     // lum += cols
        "addl %8, %3\n"                     // row1+= mod
        "addl %8, %5\n"                     // row2+= mod
        "movl $0, %6\n"                     // x=0
        "cmpl %7, %2\n"
        "jb 1b\n"                           // while (lum < y): addresses compare unsigned
        "emms\n"
        "popl %%ebx\n"
        :
        : "m" (cr), "r"(cb), "r"(lum),
          "r"(row1), "r"(cols), "r"(row2), "m"(x), "m"(y), "m"(mod)
#ifndef GCC2_HACK
          ,[_MMX_0080w] "m" (*MMX_0080w),
          [_MMX_Ugrn565] "m" (*MMX_Ugrn565),
          [_MMX_Ublu5x5] "m" (*MMX_Ublu5x5),
          [_MMX_00FFw] "m" (*MMX_00FFw),
          [_MMX_Vgrn565] "m" (*MMX_Vgrn565),
          [_MMX_Vred5x5] "m" (*MMX_Vred5x5),
          [_MMX_Ycoeff] "m" (*MMX_Ycoeff),
          [_MMX_red565] "m" (*MMX_red565),
          [_MMX_grn565] "m" (*MMX_grn565)
#endif
        /* The asm stores to the output rows and to x, and changes the
           flags; tell the compiler so it cannot cache memory across it. */
        : "memory", "cc"
    );
}

/* GCC2_HACK is only consulted by the code above; remove the definition again. */
#undef GCC2_HACK
545 

0  546 
#endif /* GCC i386 inline assembly */ 