30 #if COMPILE_TEMPLATE_MMXEXT 31 #define PREFETCH "prefetchnta" 33 #define PREFETCH " # nop" 36 #if COMPILE_TEMPLATE_MMXEXT 37 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 39 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" 41 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 43 #define YSCALEYUV2PACKEDX_UV \ 45 "xor %%"REG_a", %%"REG_a" \n\t"\ 49 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 50 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 51 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 52 "movq %%mm3, %%mm4 \n\t"\ 55 "movq 8(%%"REG_d"), %%mm0 \n\t" \ 56 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \ 57 "add %6, %%"REG_S" \n\t" \ 58 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" \ 59 "add $16, %%"REG_d" \n\t"\ 60 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 61 "pmulhw %%mm0, %%mm2 \n\t"\ 62 "pmulhw %%mm0, %%mm5 \n\t"\ 63 "paddw %%mm2, %%mm3 \n\t"\ 64 "paddw %%mm5, %%mm4 \n\t"\ 65 "test %%"REG_S", %%"REG_S" \n\t"\ 68 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 69 "lea "offset"(%0), %%"REG_d" \n\t"\ 70 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 71 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 72 "movq "#dst1", "#dst2" \n\t"\ 75 "movq 8(%%"REG_d"), "#coeff" \n\t" \ 76 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \ 77 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \ 78 "add $16, %%"REG_d" \n\t"\ 79 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 80 "pmulhw "#coeff", "#src1" \n\t"\ 81 "pmulhw "#coeff", "#src2" \n\t"\ 82 "paddw "#src1", "#dst1" \n\t"\ 83 "paddw "#src2", "#dst2" \n\t"\ 84 "test %%"REG_S", %%"REG_S" \n\t"\ 87 #define YSCALEYUV2PACKEDX \ 88 YSCALEYUV2PACKEDX_UV \ 89 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 91 #define YSCALEYUV2PACKEDX_END \ 92 :: "r" (&c->redDither), \ 93 "m" (dummy), "m" (dummy), "m" (dummy),\ 94 "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 95 : "%"REG_a, "%"REG_d, "%"REG_S \ 98 #define YSCALEYUV2PACKEDX_ACCURATE_UV \ 100 "xor %%"REG_a", %%"REG_a" \n\t"\ 104 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 105 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 106 "pxor %%mm4, %%mm4 \n\t"\ 107 "pxor %%mm5, %%mm5 \n\t"\ 108 "pxor %%mm6, %%mm6 \n\t"\ 109 "pxor %%mm7, %%mm7 \n\t"\ 112 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" \ 113 "add %6, %%"REG_S" \n\t" \ 114 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \ 115 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 116 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" \ 117 "movq %%mm0, %%mm3 \n\t"\ 118 "punpcklwd %%mm1, %%mm0 \n\t"\ 119 "punpckhwd %%mm1, %%mm3 \n\t"\ 120 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" \ 121 "pmaddwd %%mm1, %%mm0 \n\t"\ 122 "pmaddwd %%mm1, %%mm3 \n\t"\ 123 "paddd %%mm0, %%mm4 \n\t"\ 124 "paddd %%mm3, %%mm5 \n\t"\ 125 "add %6, %%"REG_S" \n\t" \ 126 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" \ 127 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 128 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 129 "test %%"REG_S", %%"REG_S" \n\t"\ 130 "movq %%mm2, %%mm0 \n\t"\ 131 "punpcklwd %%mm3, %%mm2 \n\t"\ 132 "punpckhwd %%mm3, %%mm0 \n\t"\ 133 "pmaddwd %%mm1, %%mm2 \n\t"\ 134 "pmaddwd %%mm1, %%mm0 \n\t"\ 135 "paddd %%mm2, %%mm6 \n\t"\ 136 "paddd %%mm0, %%mm7 \n\t"\ 138 "psrad $16, %%mm4 \n\t"\ 139 "psrad $16, %%mm5 \n\t"\ 140 "psrad $16, %%mm6 \n\t"\ 141 "psrad $16, %%mm7 \n\t"\ 142 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 143 "packssdw %%mm5, %%mm4 \n\t"\ 144 "packssdw %%mm7, %%mm6 \n\t"\ 145 "paddw %%mm0, %%mm4 \n\t"\ 146 "paddw %%mm0, %%mm6 \n\t"\ 147 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 148 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 150 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 151 "lea "offset"(%0), %%"REG_d" \n\t"\ 152 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 153 "pxor %%mm1, %%mm1 \n\t"\ 154 "pxor %%mm5, %%mm5 \n\t"\ 155 "pxor %%mm7, %%mm7 \n\t"\ 156 "pxor %%mm6, %%mm6 \n\t"\ 159 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \ 160 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \ 161 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 162 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" \ 163 "movq %%mm0, %%mm3 \n\t"\ 164 "punpcklwd %%mm4, %%mm0 \n\t"\ 165 "punpckhwd %%mm4, %%mm3 \n\t"\ 166 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" \ 167 "pmaddwd %%mm4, %%mm0 \n\t"\ 168 "pmaddwd %%mm4, %%mm3 \n\t"\ 169 "paddd %%mm0, %%mm1 \n\t"\ 170 "paddd %%mm3, %%mm5 \n\t"\ 171 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \ 172 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 173 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 174 "test %%"REG_S", %%"REG_S" \n\t"\ 175 "movq %%mm2, %%mm0 \n\t"\ 176 "punpcklwd %%mm3, %%mm2 \n\t"\ 177 "punpckhwd %%mm3, %%mm0 \n\t"\ 178 "pmaddwd %%mm4, %%mm2 \n\t"\ 179 "pmaddwd %%mm4, %%mm0 \n\t"\ 180 "paddd %%mm2, %%mm7 \n\t"\ 181 "paddd %%mm0, %%mm6 \n\t"\ 183 "psrad $16, %%mm1 \n\t"\ 184 "psrad $16, %%mm5 \n\t"\ 185 "psrad $16, %%mm7 \n\t"\ 186 "psrad $16, %%mm6 \n\t"\ 187 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 188 "packssdw %%mm5, %%mm1 \n\t"\ 189 "packssdw %%mm6, %%mm7 \n\t"\ 190 "paddw %%mm0, %%mm1 \n\t"\ 191 "paddw %%mm0, %%mm7 \n\t"\ 192 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 193 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 195 #define YSCALEYUV2PACKEDX_ACCURATE \ 196 YSCALEYUV2PACKEDX_ACCURATE_UV \ 197 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 199 #define YSCALEYUV2RGBX \ 200 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \ 201 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \ 202 "movq %%mm3, %%mm2 \n\t" \ 203 "movq %%mm4, %%mm5 \n\t" \ 204 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 205 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 207 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 208 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 209 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \ 210 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \ 211 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 212 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 214 "paddw %%mm3, %%mm4 \n\t"\ 215 "movq %%mm2, %%mm0 \n\t"\ 216 "movq %%mm5, %%mm6 \n\t"\ 217 "movq %%mm4, %%mm3 \n\t"\ 218 "punpcklwd %%mm2, %%mm2 \n\t"\ 219 "punpcklwd %%mm5, %%mm5 \n\t"\ 220 "punpcklwd %%mm4, %%mm4 \n\t"\ 221 "paddw %%mm1, %%mm2 \n\t"\ 222 "paddw %%mm1, %%mm5 \n\t"\ 223 "paddw %%mm1, %%mm4 \n\t"\ 224 "punpckhwd %%mm0, %%mm0 \n\t"\ 225 "punpckhwd %%mm6, %%mm6 \n\t"\ 226 "punpckhwd %%mm3, %%mm3 \n\t"\ 227 "paddw %%mm7, %%mm0 \n\t"\ 228 "paddw %%mm7, %%mm6 \n\t"\ 229 "paddw %%mm7, %%mm3 \n\t"\ 231 "packuswb %%mm0, %%mm2 \n\t"\ 232 "packuswb %%mm6, %%mm5 \n\t"\ 233 "packuswb %%mm3, %%mm4 \n\t"\ 235 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 236 "movq "#b", "#q2" \n\t" \ 237 "movq "#r", "#t" \n\t" \ 238 "punpcklbw "#g", "#b" \n\t" \ 239 "punpcklbw "#a", "#r" \n\t" \ 240 "punpckhbw "#g", "#q2" \n\t" \ 241 "punpckhbw "#a", "#t" \n\t" \ 242 "movq "#b", "#q0" \n\t" \ 243 "movq "#q2", "#q3" \n\t" \ 244 "punpcklwd "#r", "#q0" \n\t" \ 245 "punpckhwd "#r", "#b" \n\t" \ 246 "punpcklwd "#t", "#q2" \n\t" \ 247 "punpckhwd "#t", "#q3" \n\t" \ 249 MOVNTQ( q0, (dst, index, 4))\ 250 MOVNTQ( b, 8(dst, index, 4))\ 251 MOVNTQ( q2, 16(dst, index, 4))\ 252 MOVNTQ( q3, 24(dst, index, 4))\ 254 "add $8, "#index" \n\t"\ 255 "cmp "#dstw", "#index" \n\t"\ 257 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 260 const int16_t **lumSrc,
int lumFilterSize,
261 const int16_t *chrFilter,
const int16_t **chrUSrc,
262 const int16_t **chrVSrc,
263 int chrFilterSize,
const int16_t **alpSrc,
268 x86_reg uv_off = c->uv_off_byte;
273 "movq %%mm2, "U_TEMP"(%0) \n\t" 274 "movq %%mm4, "V_TEMP"(%0) \n\t" 275 "movq %%mm5, "Y_TEMP"(%0) \n\t" 277 "movq "Y_TEMP"(%0), %%mm5 \n\t" 278 "psraw $3, %%mm1 \n\t" 279 "psraw $3, %%mm7 \n\t" 280 "packuswb %%mm7, %%mm1 \n\t" 281 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
286 "pcmpeqd %%mm7, %%mm7 \n\t" 287 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
293 const int16_t **lumSrc,
int lumFilterSize,
294 const int16_t *chrFilter,
const int16_t **chrUSrc,
295 const int16_t **chrVSrc,
296 int chrFilterSize,
const int16_t **alpSrc,
301 x86_reg uv_off = c->uv_off_byte;
307 "psraw $3, %%mm1 \n\t" 308 "psraw $3, %%mm7 \n\t" 309 "packuswb %%mm7, %%mm1 \n\t" 310 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
315 "pcmpeqd %%mm7, %%mm7 \n\t" 316 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
321 #define REAL_WRITERGB16(dst, dstw, index) \ 322 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 323 "pand "MANGLE(bFC)", %%mm4 \n\t" \ 324 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 325 "psrlq $3, %%mm2 \n\t"\ 327 "movq %%mm2, %%mm1 \n\t"\ 328 "movq %%mm4, %%mm3 \n\t"\ 330 "punpcklbw %%mm7, %%mm3 \n\t"\ 331 "punpcklbw %%mm5, %%mm2 \n\t"\ 332 "punpckhbw %%mm7, %%mm4 \n\t"\ 333 "punpckhbw %%mm5, %%mm1 \n\t"\ 335 "psllq $3, %%mm3 \n\t"\ 336 "psllq $3, %%mm4 \n\t"\ 338 "por %%mm3, %%mm2 \n\t"\ 339 "por %%mm4, %%mm1 \n\t"\ 341 MOVNTQ(%%mm2, (dst, index, 2))\ 342 MOVNTQ(%%mm1, 8(dst, index, 2))\ 344 "add $8, "#index" \n\t"\ 345 "cmp "#dstw", "#index" \n\t"\ 347 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 350 const int16_t **lumSrc,
int lumFilterSize,
351 const int16_t *chrFilter,
const int16_t **chrUSrc,
352 const int16_t **chrVSrc,
353 int chrFilterSize,
const int16_t **alpSrc,
358 x86_reg uv_off = c->uv_off_byte;
362 "pxor %%mm7, %%mm7 \n\t" 374 const int16_t **lumSrc,
int lumFilterSize,
375 const int16_t *chrFilter,
const int16_t **chrUSrc,
376 const int16_t **chrVSrc,
377 int chrFilterSize,
const int16_t **alpSrc,
382 x86_reg uv_off = c->uv_off_byte;
386 "pxor %%mm7, %%mm7 \n\t" 397 #define REAL_WRITERGB15(dst, dstw, index) \ 398 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 399 "pand "MANGLE(bF8)", %%mm4 \n\t" \ 400 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 401 "psrlq $3, %%mm2 \n\t"\ 402 "psrlq $1, %%mm5 \n\t"\ 404 "movq %%mm2, %%mm1 \n\t"\ 405 "movq %%mm4, %%mm3 \n\t"\ 407 "punpcklbw %%mm7, %%mm3 \n\t"\ 408 "punpcklbw %%mm5, %%mm2 \n\t"\ 409 "punpckhbw %%mm7, %%mm4 \n\t"\ 410 "punpckhbw %%mm5, %%mm1 \n\t"\ 412 "psllq $2, %%mm3 \n\t"\ 413 "psllq $2, %%mm4 \n\t"\ 415 "por %%mm3, %%mm2 \n\t"\ 416 "por %%mm4, %%mm1 \n\t"\ 418 MOVNTQ(%%mm2, (dst, index, 2))\ 419 MOVNTQ(%%mm1, 8(dst, index, 2))\ 421 "add $8, "#index" \n\t"\ 422 "cmp "#dstw", "#index" \n\t"\ 424 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 427 const int16_t **lumSrc,
int lumFilterSize,
428 const int16_t *chrFilter,
const int16_t **chrUSrc,
429 const int16_t **chrVSrc,
430 int chrFilterSize,
const int16_t **alpSrc,
435 x86_reg uv_off = c->uv_off_byte;
439 "pxor %%mm7, %%mm7 \n\t" 451 const int16_t **lumSrc,
int lumFilterSize,
452 const int16_t *chrFilter,
const int16_t **chrUSrc,
453 const int16_t **chrVSrc,
454 int chrFilterSize,
const int16_t **alpSrc,
459 x86_reg uv_off = c->uv_off_byte;
463 "pxor %%mm7, %%mm7 \n\t" 474 #define WRITEBGR24MMX(dst, dstw, index) \ 476 "movq %%mm2, %%mm1 \n\t" \ 477 "movq %%mm5, %%mm6 \n\t" \ 478 "punpcklbw %%mm4, %%mm2 \n\t" \ 479 "punpcklbw %%mm7, %%mm5 \n\t" \ 480 "punpckhbw %%mm4, %%mm1 \n\t" \ 481 "punpckhbw %%mm7, %%mm6 \n\t" \ 482 "movq %%mm2, %%mm0 \n\t" \ 483 "movq %%mm1, %%mm3 \n\t" \ 484 "punpcklwd %%mm5, %%mm0 \n\t" \ 485 "punpckhwd %%mm5, %%mm2 \n\t" \ 486 "punpcklwd %%mm6, %%mm1 \n\t" \ 487 "punpckhwd %%mm6, %%mm3 \n\t" \ 489 "movq %%mm0, %%mm4 \n\t" \ 490 "movq %%mm2, %%mm6 \n\t" \ 491 "movq %%mm1, %%mm5 \n\t" \ 492 "movq %%mm3, %%mm7 \n\t" \ 494 "psllq $40, %%mm0 \n\t" \ 495 "psllq $40, %%mm2 \n\t" \ 496 "psllq $40, %%mm1 \n\t" \ 497 "psllq $40, %%mm3 \n\t" \ 499 "punpckhdq %%mm4, %%mm0 \n\t" \ 500 "punpckhdq %%mm6, %%mm2 \n\t" \ 501 "punpckhdq %%mm5, %%mm1 \n\t" \ 502 "punpckhdq %%mm7, %%mm3 \n\t" \ 504 "psrlq $8, %%mm0 \n\t" \ 505 "movq %%mm2, %%mm6 \n\t" \ 506 "psllq $40, %%mm2 \n\t" \ 507 "por %%mm2, %%mm0 \n\t" \ 508 MOVNTQ(%%mm0, (dst))\ 510 "psrlq $24, %%mm6 \n\t" \ 511 "movq %%mm1, %%mm5 \n\t" \ 512 "psllq $24, %%mm1 \n\t" \ 513 "por %%mm1, %%mm6 \n\t" \ 514 MOVNTQ(%%mm6, 8(dst))\ 516 "psrlq $40, %%mm5 \n\t" \ 517 "psllq $8, %%mm3 \n\t" \ 518 "por %%mm3, %%mm5 \n\t" \ 519 MOVNTQ(%%mm5, 16(dst))\ 521 "add $24, "#dst" \n\t"\ 523 "add $8, "#index" \n\t"\ 524 "cmp "#dstw", "#index" \n\t"\ 527 #define WRITEBGR24MMXEXT(dst, dstw, index) \ 529 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 530 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 531 "pshufw $0x50, %%mm2, %%mm1 \n\t" \ 532 "pshufw $0x50, %%mm4, %%mm3 \n\t" \ 533 "pshufw $0x00, %%mm5, %%mm6 \n\t" \ 535 "pand %%mm0, %%mm1 \n\t" \ 536 "pand %%mm0, %%mm3 \n\t" \ 537 "pand %%mm7, %%mm6 \n\t" \ 539 "psllq $8, %%mm3 \n\t" \ 540 "por %%mm1, %%mm6 \n\t"\ 541 "por %%mm3, %%mm6 \n\t"\ 542 MOVNTQ(%%mm6, (dst))\ 544 "psrlq $8, %%mm4 \n\t" \ 545 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \ 546 "pshufw $0x55, %%mm4, %%mm3 \n\t" \ 547 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \ 549 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \ 550 "pand %%mm7, %%mm3 \n\t" \ 551 "pand %%mm0, %%mm6 \n\t" \ 553 "por %%mm1, %%mm3 \n\t" \ 554 "por %%mm3, %%mm6 \n\t"\ 555 MOVNTQ(%%mm6, 8(dst))\ 557 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \ 558 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \ 559 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \ 561 "pand %%mm7, %%mm1 \n\t" \ 562 "pand %%mm0, %%mm3 \n\t" \ 563 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \ 565 "por %%mm1, %%mm3 \n\t"\ 566 "por %%mm3, %%mm6 \n\t"\ 567 MOVNTQ(%%mm6, 16(dst))\ 569 "add $24, "#dst" \n\t"\ 571 "add $8, "#index" \n\t"\ 572 "cmp "#dstw", "#index" \n\t"\ 575 #if COMPILE_TEMPLATE_MMXEXT 577 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 580 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 584 const int16_t **lumSrc,
int lumFilterSize,
585 const int16_t *chrFilter,
const int16_t **chrUSrc,
586 const int16_t **chrVSrc,
587 int chrFilterSize,
const int16_t **alpSrc,
592 x86_reg uv_off = c->uv_off_byte;
596 "pxor %%mm7, %%mm7 \n\t" 597 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" 598 "add %4, %%"REG_c
" \n\t" 600 ::
"r" (&c->redDither),
601 "m" (dummy),
"m" (dummy),
"m" (dummy),
602 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
603 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
608 const int16_t **lumSrc,
int lumFilterSize,
609 const int16_t *chrFilter,
const int16_t **chrUSrc,
610 const int16_t **chrVSrc,
611 int chrFilterSize,
const int16_t **alpSrc,
616 x86_reg uv_off = c->uv_off_byte;
620 "pxor %%mm7, %%mm7 \n\t" 621 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" 622 "add %4, %%"REG_c
" \n\t" 624 ::
"r" (&c->redDither),
625 "m" (dummy),
"m" (dummy),
"m" (dummy),
626 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
627 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
631 #define REAL_WRITEYUY2(dst, dstw, index) \ 632 "packuswb %%mm3, %%mm3 \n\t"\ 633 "packuswb %%mm4, %%mm4 \n\t"\ 634 "packuswb %%mm7, %%mm1 \n\t"\ 635 "punpcklbw %%mm4, %%mm3 \n\t"\ 636 "movq %%mm1, %%mm7 \n\t"\ 637 "punpcklbw %%mm3, %%mm1 \n\t"\ 638 "punpckhbw %%mm3, %%mm7 \n\t"\ 640 MOVNTQ(%%mm1, (dst, index, 2))\ 641 MOVNTQ(%%mm7, 8(dst, index, 2))\ 643 "add $8, "#index" \n\t"\ 644 "cmp "#dstw", "#index" \n\t"\ 646 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 649 const int16_t **lumSrc,
int lumFilterSize,
650 const int16_t *chrFilter,
const int16_t **chrUSrc,
651 const int16_t **chrVSrc,
652 int chrFilterSize,
const int16_t **alpSrc,
657 x86_reg uv_off = c->uv_off_byte;
661 "psraw $3, %%mm3 \n\t" 662 "psraw $3, %%mm4 \n\t" 663 "psraw $3, %%mm1 \n\t" 664 "psraw $3, %%mm7 \n\t" 670 const int16_t **lumSrc,
int lumFilterSize,
671 const int16_t *chrFilter,
const int16_t **chrUSrc,
672 const int16_t **chrVSrc,
673 int chrFilterSize,
const int16_t **alpSrc,
678 x86_reg uv_off = c->uv_off_byte;
682 "psraw $3, %%mm3 \n\t" 683 "psraw $3, %%mm4 \n\t" 684 "psraw $3, %%mm1 \n\t" 685 "psraw $3, %%mm7 \n\t" 690 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 691 "xor "#index", "#index" \n\t"\ 694 "movq (%2, "#index"), %%mm2 \n\t" \ 695 "movq (%3, "#index"), %%mm3 \n\t" \ 696 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 697 "movq (%2, "#index"), %%mm5 \n\t" \ 698 "movq (%3, "#index"), %%mm4 \n\t" \ 699 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 700 "psubw %%mm3, %%mm2 \n\t" \ 701 "psubw %%mm4, %%mm5 \n\t" \ 702 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 703 "pmulhw %%mm0, %%mm2 \n\t" \ 704 "pmulhw %%mm0, %%mm5 \n\t" \ 705 "psraw $4, %%mm3 \n\t" \ 706 "psraw $4, %%mm4 \n\t" \ 707 "paddw %%mm2, %%mm3 \n\t" \ 708 "paddw %%mm5, %%mm4 \n\t" \ 709 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 710 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 711 "movq %%mm3, %%mm2 \n\t" \ 712 "movq %%mm4, %%mm5 \n\t" \ 713 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 714 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 717 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 718 "movq ("#b1", "#index", 2), %%mm0 \n\t" \ 719 "movq ("#b2", "#index", 2), %%mm1 \n\t" \ 720 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \ 721 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \ 722 "psubw %%mm1, %%mm0 \n\t" \ 723 "psubw %%mm7, %%mm6 \n\t" \ 724 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 725 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 726 "psraw $4, %%mm1 \n\t" \ 727 "psraw $4, %%mm7 \n\t" \ 728 "paddw %%mm0, %%mm1 \n\t" \ 729 "paddw %%mm6, %%mm7 \n\t" \ 731 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 732 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 733 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 734 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 735 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 736 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 737 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 739 "paddw %%mm3, %%mm4 \n\t"\ 740 "movq %%mm2, %%mm0 \n\t"\ 741 "movq %%mm5, %%mm6 \n\t"\ 742 "movq %%mm4, %%mm3 \n\t"\ 743 "punpcklwd %%mm2, %%mm2 \n\t"\ 744 "punpcklwd %%mm5, %%mm5 \n\t"\ 745 "punpcklwd %%mm4, %%mm4 \n\t"\ 746 "paddw %%mm1, %%mm2 \n\t"\ 747 "paddw %%mm1, %%mm5 \n\t"\ 748 "paddw %%mm1, %%mm4 \n\t"\ 749 "punpckhwd %%mm0, %%mm0 \n\t"\ 750 "punpckhwd %%mm6, %%mm6 \n\t"\ 751 "punpckhwd %%mm3, %%mm3 \n\t"\ 752 "paddw %%mm7, %%mm0 \n\t"\ 753 "paddw %%mm7, %%mm6 \n\t"\ 754 "paddw %%mm7, %%mm3 \n\t"\ 756 "packuswb %%mm0, %%mm2 \n\t"\ 757 "packuswb %%mm6, %%mm5 \n\t"\ 758 "packuswb %%mm3, %%mm4 \n\t"\ 760 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 762 #define YSCALEYUV2RGB(index, c) \ 763 REAL_YSCALEYUV2RGB_UV(index, c) \ 764 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 765 REAL_YSCALEYUV2RGB_COEFF(c) 771 const int16_t *ubuf[2],
const int16_t *vbuf[2],
772 const int16_t *abuf[2],
uint8_t *dest,
773 int dstW,
int yalpha,
int uvalpha,
int y)
775 const int16_t *buf0 = buf[0], *buf1 = buf[1],
776 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
779 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
784 "psraw $3, %%mm1 \n\t" 785 "psraw $3, %%mm7 \n\t" 786 "packuswb %%mm7, %%mm1 \n\t" 787 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
788 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
790 "r" (abuf0),
"r" (abuf1)
794 *(
const uint16_t **)(&c->u_temp)=abuf0;
795 *(
const uint16_t **)(&c->v_temp)=abuf1;
798 "mov %4, %%"REG_b
" \n\t" 799 "push %%"REG_BP
" \n\t" 803 "mov "U_TEMP"(%5), %0 \n\t" 804 "mov "V_TEMP"(%5), %1 \n\t" 806 "psraw $3, %%mm1 \n\t" 807 "psraw $3, %%mm7 \n\t" 808 "packuswb %%mm7, %%mm1 \n\t" 811 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
812 "pop %%"REG_BP
" \n\t" 814 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
821 "mov %4, %%"REG_b
" \n\t" 822 "push %%"REG_BP
" \n\t" 824 "pcmpeqd %%mm7, %%mm7 \n\t" 825 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
826 "pop %%"REG_BP
" \n\t" 828 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
835 const int16_t *ubuf[2],
const int16_t *vbuf[2],
836 const int16_t *abuf[2],
uint8_t *dest,
837 int dstW,
int yalpha,
int uvalpha,
int y)
839 const int16_t *buf0 = buf[0], *buf1 = buf[1],
840 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
845 "mov %4, %%"REG_b
" \n\t" 846 "push %%"REG_BP
" \n\t" 848 "pxor %%mm7, %%mm7 \n\t" 850 "pop %%"REG_BP
" \n\t" 852 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
858 const int16_t *ubuf[2],
const int16_t *vbuf[2],
859 const int16_t *abuf[2],
uint8_t *dest,
860 int dstW,
int yalpha,
int uvalpha,
int y)
862 const int16_t *buf0 = buf[0], *buf1 = buf[1],
863 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
868 "mov %4, %%"REG_b
" \n\t" 869 "push %%"REG_BP
" \n\t" 871 "pxor %%mm7, %%mm7 \n\t" 879 "pop %%"REG_BP
" \n\t" 881 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
887 const int16_t *ubuf[2],
const int16_t *vbuf[2],
888 const int16_t *abuf[2],
uint8_t *dest,
889 int dstW,
int yalpha,
int uvalpha,
int y)
891 const int16_t *buf0 = buf[0], *buf1 = buf[1],
892 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
897 "mov %4, %%"REG_b
" \n\t" 898 "push %%"REG_BP
" \n\t" 900 "pxor %%mm7, %%mm7 \n\t" 908 "pop %%"REG_BP
" \n\t" 910 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
915 #define REAL_YSCALEYUV2PACKED(index, c) \ 916 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 917 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 918 "psraw $3, %%mm0 \n\t"\ 919 "psraw $3, %%mm1 \n\t"\ 920 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 921 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 922 "xor "#index", "#index" \n\t"\ 925 "movq (%2, "#index"), %%mm2 \n\t" \ 926 "movq (%3, "#index"), %%mm3 \n\t" \ 927 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 928 "movq (%2, "#index"), %%mm5 \n\t" \ 929 "movq (%3, "#index"), %%mm4 \n\t" \ 930 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 931 "psubw %%mm3, %%mm2 \n\t" \ 932 "psubw %%mm4, %%mm5 \n\t" \ 933 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 934 "pmulhw %%mm0, %%mm2 \n\t" \ 935 "pmulhw %%mm0, %%mm5 \n\t" \ 936 "psraw $7, %%mm3 \n\t" \ 937 "psraw $7, %%mm4 \n\t" \ 938 "paddw %%mm2, %%mm3 \n\t" \ 939 "paddw %%mm5, %%mm4 \n\t" \ 940 "movq (%0, "#index", 2), %%mm0 \n\t" \ 941 "movq (%1, "#index", 2), %%mm1 \n\t" \ 942 "movq 8(%0, "#index", 2), %%mm6 \n\t" \ 943 "movq 8(%1, "#index", 2), %%mm7 \n\t" \ 944 "psubw %%mm1, %%mm0 \n\t" \ 945 "psubw %%mm7, %%mm6 \n\t" \ 946 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 947 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 948 "psraw $7, %%mm1 \n\t" \ 949 "psraw $7, %%mm7 \n\t" \ 950 "paddw %%mm0, %%mm1 \n\t" \ 951 "paddw %%mm6, %%mm7 \n\t" \ 953 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 956 const int16_t *ubuf[2],
const int16_t *vbuf[2],
957 const int16_t *abuf[2],
uint8_t *dest,
958 int dstW,
int yalpha,
int uvalpha,
int y)
960 const int16_t *buf0 = buf[0], *buf1 = buf[1],
961 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
966 "mov %4, %%"REG_b
" \n\t" 967 "push %%"REG_BP
" \n\t" 970 "pop %%"REG_BP
" \n\t" 972 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
977 #define REAL_YSCALEYUV2RGB1(index, c) \ 978 "xor "#index", "#index" \n\t"\ 981 "movq (%2, "#index"), %%mm3 \n\t" \ 982 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 983 "movq (%2, "#index"), %%mm4 \n\t" \ 984 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 985 "psraw $4, %%mm3 \n\t" \ 986 "psraw $4, %%mm4 \n\t" \ 987 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 988 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 989 "movq %%mm3, %%mm2 \n\t" \ 990 "movq %%mm4, %%mm5 \n\t" \ 991 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 992 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 994 "movq (%0, "#index", 2), %%mm1 \n\t" \ 995 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 996 "psraw $4, %%mm1 \n\t" \ 997 "psraw $4, %%mm7 \n\t" \ 998 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 999 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1000 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1001 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1002 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1003 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1005 "paddw %%mm3, %%mm4 \n\t"\ 1006 "movq %%mm2, %%mm0 \n\t"\ 1007 "movq %%mm5, %%mm6 \n\t"\ 1008 "movq %%mm4, %%mm3 \n\t"\ 1009 "punpcklwd %%mm2, %%mm2 \n\t"\ 1010 "punpcklwd %%mm5, %%mm5 \n\t"\ 1011 "punpcklwd %%mm4, %%mm4 \n\t"\ 1012 "paddw %%mm1, %%mm2 \n\t"\ 1013 "paddw %%mm1, %%mm5 \n\t"\ 1014 "paddw %%mm1, %%mm4 \n\t"\ 1015 "punpckhwd %%mm0, %%mm0 \n\t"\ 1016 "punpckhwd %%mm6, %%mm6 \n\t"\ 1017 "punpckhwd %%mm3, %%mm3 \n\t"\ 1018 "paddw %%mm7, %%mm0 \n\t"\ 1019 "paddw %%mm7, %%mm6 \n\t"\ 1020 "paddw %%mm7, %%mm3 \n\t"\ 1022 "packuswb %%mm0, %%mm2 \n\t"\ 1023 "packuswb %%mm6, %%mm5 \n\t"\ 1024 "packuswb %%mm3, %%mm4 \n\t"\ 1026 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1029 #define REAL_YSCALEYUV2RGB1b(index, c) \ 1030 "xor "#index", "#index" \n\t"\ 1033 "movq (%2, "#index"), %%mm2 \n\t" \ 1034 "movq (%3, "#index"), %%mm3 \n\t" \ 1035 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1036 "movq (%2, "#index"), %%mm5 \n\t" \ 1037 "movq (%3, "#index"), %%mm4 \n\t" \ 1038 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1039 "paddw %%mm2, %%mm3 \n\t" \ 1040 "paddw %%mm5, %%mm4 \n\t" \ 1041 "psrlw $5, %%mm3 \n\t" \ 1042 "psrlw $5, %%mm4 \n\t" \ 1043 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1044 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1045 "movq %%mm3, %%mm2 \n\t" \ 1046 "movq %%mm4, %%mm5 \n\t" \ 1047 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1048 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1050 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1051 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1052 "psraw $4, %%mm1 \n\t" \ 1053 "psraw $4, %%mm7 \n\t" \ 1054 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1055 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1056 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1057 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1058 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1059 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1061 "paddw %%mm3, %%mm4 \n\t"\ 1062 "movq %%mm2, %%mm0 \n\t"\ 1063 "movq %%mm5, %%mm6 \n\t"\ 1064 "movq %%mm4, %%mm3 \n\t"\ 1065 "punpcklwd %%mm2, %%mm2 \n\t"\ 1066 "punpcklwd %%mm5, %%mm5 \n\t"\ 1067 "punpcklwd %%mm4, %%mm4 \n\t"\ 1068 "paddw %%mm1, %%mm2 \n\t"\ 1069 "paddw %%mm1, %%mm5 \n\t"\ 1070 "paddw %%mm1, %%mm4 \n\t"\ 1071 "punpckhwd %%mm0, %%mm0 \n\t"\ 1072 "punpckhwd %%mm6, %%mm6 \n\t"\ 1073 "punpckhwd %%mm3, %%mm3 \n\t"\ 1074 "paddw %%mm7, %%mm0 \n\t"\ 1075 "paddw %%mm7, %%mm6 \n\t"\ 1076 "paddw %%mm7, %%mm3 \n\t"\ 1078 "packuswb %%mm0, %%mm2 \n\t"\ 1079 "packuswb %%mm6, %%mm5 \n\t"\ 1080 "packuswb %%mm3, %%mm4 \n\t"\ 1082 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1084 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 1085 "movq (%1, "#index", 2), %%mm7 \n\t" \ 1086 "movq 8(%1, "#index", 2), %%mm1 \n\t" \ 1087 "psraw $7, %%mm7 \n\t" \ 1088 "psraw $7, %%mm1 \n\t" \ 1089 "packuswb %%mm1, %%mm7 \n\t" 1090 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1096 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1097 const int16_t *abuf0,
uint8_t *dest,
1098 int dstW,
int uvalpha,
int y)
1100 const int16_t *ubuf0 = ubuf[0];
1101 const int16_t *buf1= buf0;
1103 if (uvalpha < 2048) {
1104 const int16_t *ubuf1 = ubuf[0];
1108 "mov %4, %%"REG_b
" \n\t" 1109 "push %%"REG_BP
" \n\t" 1112 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1113 "pop %%"REG_BP
" \n\t" 1115 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1121 "mov %4, %%"REG_b
" \n\t" 1122 "push %%"REG_BP
" \n\t" 1124 "pcmpeqd %%mm7, %%mm7 \n\t" 1125 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1126 "pop %%"REG_BP
" \n\t" 1128 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1133 const int16_t *ubuf1 = ubuf[1];
1137 "mov %4, %%"REG_b
" \n\t" 1138 "push %%"REG_BP
" \n\t" 1141 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1142 "pop %%"REG_BP
" \n\t" 1144 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1150 "mov %4, %%"REG_b
" \n\t" 1151 "push %%"REG_BP
" \n\t" 1153 "pcmpeqd %%mm7, %%mm7 \n\t" 1154 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1155 "pop %%"REG_BP
" \n\t" 1157 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1165 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1166 const int16_t *abuf0,
uint8_t *dest,
1167 int dstW,
int uvalpha,
int y)
1169 const int16_t *ubuf0 = ubuf[0];
1170 const int16_t *buf1= buf0;
1172 if (uvalpha < 2048) {
1173 const int16_t *ubuf1 = ubuf[0];
1176 "mov %4, %%"REG_b
" \n\t" 1177 "push %%"REG_BP
" \n\t" 1179 "pxor %%mm7, %%mm7 \n\t" 1181 "pop %%"REG_BP
" \n\t" 1183 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1187 const int16_t *ubuf1 = ubuf[1];
1190 "mov %4, %%"REG_b
" \n\t" 1191 "push %%"REG_BP
" \n\t" 1193 "pxor %%mm7, %%mm7 \n\t" 1195 "pop %%"REG_BP
" \n\t" 1197 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1204 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1205 const int16_t *abuf0,
uint8_t *dest,
1206 int dstW,
int uvalpha,
int y)
1208 const int16_t *ubuf0 = ubuf[0];
1209 const int16_t *buf1= buf0;
1211 if (uvalpha < 2048) {
1212 const int16_t *ubuf1 = ubuf[0];
1215 "mov %4, %%"REG_b
" \n\t" 1216 "push %%"REG_BP
" \n\t" 1218 "pxor %%mm7, %%mm7 \n\t" 1226 "pop %%"REG_BP
" \n\t" 1228 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1232 const int16_t *ubuf1 = ubuf[1];
1235 "mov %4, %%"REG_b
" \n\t" 1236 "push %%"REG_BP
" \n\t" 1238 "pxor %%mm7, %%mm7 \n\t" 1246 "pop %%"REG_BP
" \n\t" 1248 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1255 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1256 const int16_t *abuf0,
uint8_t *dest,
1257 int dstW,
int uvalpha,
int y)
1259 const int16_t *ubuf0 = ubuf[0];
1260 const int16_t *buf1= buf0;
1262 if (uvalpha < 2048) {
1263 const int16_t *ubuf1 = ubuf[0];
1266 "mov %4, %%"REG_b
" \n\t" 1267 "push %%"REG_BP
" \n\t" 1269 "pxor %%mm7, %%mm7 \n\t" 1277 "pop %%"REG_BP
" \n\t" 1279 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1283 const int16_t *ubuf1 = ubuf[1];
1286 "mov %4, %%"REG_b
" \n\t" 1287 "push %%"REG_BP
" \n\t" 1289 "pxor %%mm7, %%mm7 \n\t" 1297 "pop %%"REG_BP
" \n\t" 1299 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1305 #define REAL_YSCALEYUV2PACKED1(index, c) \ 1306 "xor "#index", "#index" \n\t"\ 1309 "movq (%2, "#index"), %%mm3 \n\t" \ 1310 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1311 "movq (%2, "#index"), %%mm4 \n\t" \ 1312 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1313 "psraw $7, %%mm3 \n\t" \ 1314 "psraw $7, %%mm4 \n\t" \ 1315 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1316 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1317 "psraw $7, %%mm1 \n\t" \ 1318 "psraw $7, %%mm7 \n\t" \ 1320 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 1322 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 1323 "xor "#index", "#index" \n\t"\ 1326 "movq (%2, "#index"), %%mm2 \n\t" \ 1327 "movq (%3, "#index"), %%mm3 \n\t" \ 1328 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1329 "movq (%2, "#index"), %%mm5 \n\t" \ 1330 "movq (%3, "#index"), %%mm4 \n\t" \ 1331 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1332 "paddw %%mm2, %%mm3 \n\t" \ 1333 "paddw %%mm5, %%mm4 \n\t" \ 1334 "psrlw $8, %%mm3 \n\t" \ 1335 "psrlw $8, %%mm4 \n\t" \ 1336 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1337 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1338 "psraw $7, %%mm1 \n\t" \ 1339 "psraw $7, %%mm7 \n\t" 1340 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 1343 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1344 const int16_t *abuf0,
uint8_t *dest,
1345 int dstW,
int uvalpha,
int y)
1347 const int16_t *ubuf0 = ubuf[0];
1348 const int16_t *buf1= buf0;
1350 if (uvalpha < 2048) {
1351 const int16_t *ubuf1 = ubuf[0];
1354 "mov %4, %%"REG_b
" \n\t" 1355 "push %%"REG_BP
" \n\t" 1358 "pop %%"REG_BP
" \n\t" 1360 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1364 const int16_t *ubuf1 = ubuf[1];
1367 "mov %4, %%"REG_b
" \n\t" 1368 "push %%"REG_BP
" \n\t" 1371 "pop %%"REG_BP
" \n\t" 1373 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1379 #if COMPILE_TEMPLATE_MMXEXT 1381 int dstWidth,
const uint8_t *src,
1384 int32_t *filterPos = c->hLumFilterPos;
1385 int16_t *
filter = c->hLumFilter;
1386 void *mmxextFilterCode = c->lumMmxextFilterCode;
1397 "mov %%"REG_b
", %5 \n\t" 1399 "mov -8(%%rsp), %%"REG_a
" \n\t" 1400 "mov %%"REG_a
", %6 \n\t" 1404 "mov -8(%%rsp), %%"REG_a
" \n\t" 1405 "mov %%"REG_a
", %5 \n\t" 1408 "pxor %%mm7, %%mm7 \n\t" 1409 "mov %0, %%"REG_c
" \n\t" 1410 "mov %1, %%"REG_D
" \n\t" 1411 "mov %2, %%"REG_d
" \n\t" 1412 "mov %3, %%"REG_b
" \n\t" 1413 "xor %%"REG_a
", %%"REG_a
" \n\t" 1419 #define CALL_MMXEXT_FILTER_CODE \ 1420 "movl (%%"REG_b"), %%esi \n\t"\ 1422 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ 1423 "add %%"REG_S", %%"REG_c" \n\t"\ 1424 "add %%"REG_a", %%"REG_D" \n\t"\ 1425 "xor %%"REG_a", %%"REG_a" \n\t"\ 1428 #define CALL_MMXEXT_FILTER_CODE \ 1429 "movl (%%"REG_b"), %%esi \n\t"\ 1431 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ 1432 "add %%"REG_a", %%"REG_D" \n\t"\ 1433 "xor %%"REG_a", %%"REG_a" \n\t"\ 1437 CALL_MMXEXT_FILTER_CODE
1438 CALL_MMXEXT_FILTER_CODE
1439 CALL_MMXEXT_FILTER_CODE
1440 CALL_MMXEXT_FILTER_CODE
1441 CALL_MMXEXT_FILTER_CODE
1442 CALL_MMXEXT_FILTER_CODE
1443 CALL_MMXEXT_FILTER_CODE
1444 CALL_MMXEXT_FILTER_CODE
1447 "mov %5, %%"REG_b
" \n\t" 1449 "mov %6, %%"REG_a
" \n\t" 1450 "mov %%"REG_a
", -8(%%rsp) \n\t" 1454 "mov %5, %%"REG_a
" \n\t" 1455 "mov %%"REG_a
", -8(%%rsp) \n\t" 1458 ::
"m" (src),
"m" (dst),
"m" (
filter),
"m" (filterPos),
1459 "m" (mmxextFilterCode)
1466 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
1472 for (i=dstWidth-1; (i*xInc)>>16 >=
srcW-1; i--)
1473 dst[i] = src[
srcW-1]*128;
1477 int dstWidth,
const uint8_t *src1,
1480 int32_t *filterPos = c->hChrFilterPos;
1481 int16_t *
filter = c->hChrFilter;
1482 void *mmxextFilterCode = c->chrMmxextFilterCode;
1493 "mov %%"REG_b
", %7 \n\t" 1495 "mov -8(%%rsp), %%"REG_a
" \n\t" 1496 "mov %%"REG_a
", %8 \n\t" 1500 "mov -8(%%rsp), %%"REG_a
" \n\t" 1501 "mov %%"REG_a
", %7 \n\t" 1504 "pxor %%mm7, %%mm7 \n\t" 1505 "mov %0, %%"REG_c
" \n\t" 1506 "mov %1, %%"REG_D
" \n\t" 1507 "mov %2, %%"REG_d
" \n\t" 1508 "mov %3, %%"REG_b
" \n\t" 1509 "xor %%"REG_a
", %%"REG_a
" \n\t" 1514 CALL_MMXEXT_FILTER_CODE
1515 CALL_MMXEXT_FILTER_CODE
1516 CALL_MMXEXT_FILTER_CODE
1517 CALL_MMXEXT_FILTER_CODE
1518 "xor %%"REG_a
", %%"REG_a
" \n\t" 1519 "mov %5, %%"REG_c
" \n\t" 1520 "mov %6, %%"REG_D
" \n\t" 1525 CALL_MMXEXT_FILTER_CODE
1526 CALL_MMXEXT_FILTER_CODE
1527 CALL_MMXEXT_FILTER_CODE
1528 CALL_MMXEXT_FILTER_CODE
1531 "mov %7, %%"REG_b
" \n\t" 1533 "mov %8, %%"REG_a
" \n\t" 1534 "mov %%"REG_a
", -8(%%rsp) \n\t" 1538 "mov %7, %%"REG_a
" \n\t" 1539 "mov %%"REG_a
", -8(%%rsp) \n\t" 1542 ::
"m" (src1),
"m" (dst1),
"m" (
filter),
"m" (filterPos),
1543 "m" (mmxextFilterCode),
"m" (src2),
"m"(dst2)
1550 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
1556 for (i=dstWidth-1; (i*xInc)>>16 >=
srcW-1; i--) {
1557 dst1[i] = src1[
srcW-1]*128;
1558 dst2[i] = src2[
srcW-1]*128;
1572 switch (c->dstFormat) {
1583 switch (c->dstFormat) {
1595 switch (c->dstFormat) {
1622 if (c->srcBpc == 8 && c->dstBpc <= 10) {
1624 #if COMPILE_TEMPLATE_MMXEXT 1630 c->hyscale_fast =
NULL;
1631 c->hcscale_fast =
NULL;
1632 #if COMPILE_TEMPLATE_MMXEXT #define YSCALEYUV2RGB1_ALPHA(index)
#define ALP_MMX_FILTER_OFFSET
static void RENAME() yuv2rgb32_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKED1(index, c)
static void RENAME() yuv2yuyv422_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define DECLARE_ALIGNED(n, t, v)
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
void(* hyscale_fast)(struct SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
Scale one horizontal line of input data using a bilinear filter to produce one line of output data...
int dstY
Last destination vertical line output from last slice.
#define YSCALEYUV2PACKEDX_END
#define SWS_FULL_CHR_H_INT
#define SWS_FAST_BILINEAR
enum AVPixelFormat dstFormat
Destination pixel format.
#define WRITERGB15(dst, dstw, index)
static void RENAME() yuv2rgb565_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb32_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb565_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKEDX
static void RENAME() yuv2rgb32_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
YV12 to RGB without scaling or interpolating.
#define WRITEBGR24(dst, dstw, index)
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
#define YSCALEYUV2RGB1b(index, c)
void(* hcscale_fast)(struct SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2PACKED(index, c)
static void RENAME() yuv2rgb565_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define WRITERGB16(dst, dstw, index)
static void filter(MpegAudioContext *s, int ch, const short *samples, int incr)
as above, but U and V bytes are swapped
#define YSCALEYUV2PACKEDX_ACCURATE
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
static void RENAME() yuv2rgb555_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
static void RENAME() yuv2yuyv422_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
static av_cold void RENAME() sws_init_swscale(SwsContext *c)
packed RGB 8:8:8, 24bpp, BGRBGR...
int dstW
Width of destination luma/alpha planes.
static void RENAME() yuv2yuyv422_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2RGB(index, c)
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
#define YSCALEYUV2PACKED1b(index, c)
static void RENAME() yuv2rgb555_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2yuyv422_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb32_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
vertical bilinear scale YV12 to RGB
#define AV_PIX_FMT_RGB555
static void RENAME() yuv2rgb565_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
#define AV_PIX_FMT_RGB565
static void RENAME() yuv2bgr24_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
#define WRITEYUY2(dst, dstw, index)
int srcW
Width of source luma/alpha planes.
AVPixelFormat
Pixel format.
static void RENAME() yuv2bgr24_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2RGB1(index, c)