42 "pxor %%mm0, %%mm0 \n" 43 "pxor %%mm7, %%mm7 \n" 47 "movq (%0, %3), %%mm3 \n" 48 "movq (%1, %3), %%mm4 \n" 53 "movq %%mm1, %%mm5 \n" 54 "movq %%mm3, %%mm6 \n" 55 "psubusb %%mm2, %%mm1 \n" 56 "psubusb %%mm4, %%mm3 \n" 57 "psubusb %%mm5, %%mm2 \n" 58 "psubusb %%mm6, %%mm4 \n" 64 "movq %%mm2, %%mm1 \n" 65 "movq %%mm4, %%mm3 \n" 67 "punpckhbw %%mm0, %%mm2 \n" 68 "punpckhbw %%mm0, %%mm4 \n" 69 "punpcklbw %%mm0, %%mm1 \n" 70 "punpcklbw %%mm0, %%mm3 \n" 72 "pmaddwd %%mm2, %%mm2 \n" 73 "pmaddwd %%mm4, %%mm4 \n" 74 "pmaddwd %%mm1, %%mm1 \n" 75 "pmaddwd %%mm3, %%mm3 \n" 77 "lea (%0, %3, 2), %0 \n" 78 "lea (%1, %3, 2), %1 \n" 80 "paddd %%mm2, %%mm1 \n" 81 "paddd %%mm4, %%mm3 \n" 82 "paddd %%mm1, %%mm7 \n" 83 "paddd %%mm3, %%mm7 \n" 88 "movq %%mm7, %%mm1 \n" 90 "paddd %%mm7, %%mm1 \n" 92 :
"+r" (pix1),
"+r" (pix2),
"=r" (tmp)
93 :
"r" ((
x86_reg) line_size),
"m" (h)
100 int line_size,
int h)
106 "pxor %%mm0, %%mm0\n" 107 "pxor %%mm7, %%mm7\n" 111 "movq 8(%0), %%mm3\n" 112 "movq 8(%1), %%mm4\n" 117 "movq %%mm1, %%mm5\n" 118 "movq %%mm3, %%mm6\n" 119 "psubusb %%mm2, %%mm1\n" 120 "psubusb %%mm4, %%mm3\n" 121 "psubusb %%mm5, %%mm2\n" 122 "psubusb %%mm6, %%mm4\n" 128 "movq %%mm2, %%mm1\n" 129 "movq %%mm4, %%mm3\n" 131 "punpckhbw %%mm0, %%mm2\n" 132 "punpckhbw %%mm0, %%mm4\n" 133 "punpcklbw %%mm0, %%mm1\n" 134 "punpcklbw %%mm0, %%mm3\n" 136 "pmaddwd %%mm2, %%mm2\n" 137 "pmaddwd %%mm4, %%mm4\n" 138 "pmaddwd %%mm1, %%mm1\n" 139 "pmaddwd %%mm3, %%mm3\n" 144 "paddd %%mm2, %%mm1\n" 145 "paddd %%mm4, %%mm3\n" 146 "paddd %%mm1, %%mm7\n" 147 "paddd %%mm3, %%mm7\n" 152 "movq %%mm7, %%mm1\n" 154 "paddd %%mm7, %%mm1\n" 156 :
"+r" (pix1),
"+r" (pix2),
"=r" (tmp)
157 :
"r" ((
x86_reg) line_size),
"m" (h)
163 static int hf_noise8_mmx(
uint8_t *pix1,
int line_size,
int h)
169 "pxor %%mm7, %%mm7\n" 170 "pxor %%mm6, %%mm6\n" 173 "movq %%mm0, %%mm1\n" 177 "movq %%mm0, %%mm2\n" 178 "movq %%mm1, %%mm3\n" 179 "punpcklbw %%mm7, %%mm0\n" 180 "punpcklbw %%mm7, %%mm1\n" 181 "punpckhbw %%mm7, %%mm2\n" 182 "punpckhbw %%mm7, %%mm3\n" 183 "psubw %%mm1, %%mm0\n" 184 "psubw %%mm3, %%mm2\n" 189 "movq %%mm4, %%mm1\n" 193 "movq %%mm4, %%mm5\n" 194 "movq %%mm1, %%mm3\n" 195 "punpcklbw %%mm7, %%mm4\n" 196 "punpcklbw %%mm7, %%mm1\n" 197 "punpckhbw %%mm7, %%mm5\n" 198 "punpckhbw %%mm7, %%mm3\n" 199 "psubw %%mm1, %%mm4\n" 200 "psubw %%mm3, %%mm5\n" 201 "psubw %%mm4, %%mm0\n" 202 "psubw %%mm5, %%mm2\n" 203 "pxor %%mm3, %%mm3\n" 204 "pxor %%mm1, %%mm1\n" 205 "pcmpgtw %%mm0, %%mm3\n\t" 206 "pcmpgtw %%mm2, %%mm1\n\t" 207 "pxor %%mm3, %%mm0\n" 208 "pxor %%mm1, %%mm2\n" 209 "psubw %%mm3, %%mm0\n" 210 "psubw %%mm1, %%mm2\n" 211 "paddw %%mm0, %%mm2\n" 212 "paddw %%mm2, %%mm6\n" 218 "movq %%mm0, %%mm1\n" 222 "movq %%mm0, %%mm2\n" 223 "movq %%mm1, %%mm3\n" 224 "punpcklbw %%mm7, %%mm0\n" 225 "punpcklbw %%mm7, %%mm1\n" 226 "punpckhbw %%mm7, %%mm2\n" 227 "punpckhbw %%mm7, %%mm3\n" 228 "psubw %%mm1, %%mm0\n" 229 "psubw %%mm3, %%mm2\n" 230 "psubw %%mm0, %%mm4\n" 231 "psubw %%mm2, %%mm5\n" 232 "pxor %%mm3, %%mm3\n" 233 "pxor %%mm1, %%mm1\n" 234 "pcmpgtw %%mm4, %%mm3\n\t" 235 "pcmpgtw %%mm5, %%mm1\n\t" 236 "pxor %%mm3, %%mm4\n" 237 "pxor %%mm1, %%mm5\n" 238 "psubw %%mm3, %%mm4\n" 239 "psubw %%mm1, %%mm5\n" 240 "paddw %%mm4, %%mm5\n" 241 "paddw %%mm5, %%mm6\n" 246 "movq %%mm4, %%mm1\n" 250 "movq %%mm4, %%mm5\n" 251 "movq %%mm1, %%mm3\n" 252 "punpcklbw %%mm7, %%mm4\n" 253 "punpcklbw %%mm7, %%mm1\n" 254 "punpckhbw %%mm7, %%mm5\n" 255 "punpckhbw %%mm7, %%mm3\n" 256 "psubw %%mm1, %%mm4\n" 257 "psubw %%mm3, %%mm5\n" 258 "psubw %%mm4, %%mm0\n" 259 "psubw %%mm5, %%mm2\n" 260 "pxor %%mm3, %%mm3\n" 261 "pxor %%mm1, %%mm1\n" 262 "pcmpgtw %%mm0, %%mm3\n\t" 263 "pcmpgtw %%mm2, %%mm1\n\t" 264 "pxor %%mm3, %%mm0\n" 265 "pxor %%mm1, %%mm2\n" 266 "psubw %%mm3, %%mm0\n" 267 "psubw %%mm1, %%mm2\n" 268 "paddw %%mm0, %%mm2\n" 269 "paddw %%mm2, %%mm6\n" 275 "movq %%mm6, %%mm0\n" 276 "punpcklwd %%mm7, %%mm0\n" 277 "punpckhwd %%mm7, %%mm6\n" 278 "paddd %%mm0, %%mm6\n" 280 "movq %%mm6, %%mm0\n" 282 "paddd %%mm6, %%mm0\n" 284 :
"+r" (pix1),
"=r" (tmp)
285 :
"r" ((
x86_reg) line_size),
"g" (h - 2)
291 static int hf_noise16_mmx(
uint8_t *pix1,
int line_size,
int h)
298 "pxor %%mm7, %%mm7\n" 299 "pxor %%mm6, %%mm6\n" 302 "movq 1(%0), %%mm1\n" 303 "movq %%mm0, %%mm2\n" 304 "movq %%mm1, %%mm3\n" 305 "punpcklbw %%mm7, %%mm0\n" 306 "punpcklbw %%mm7, %%mm1\n" 307 "punpckhbw %%mm7, %%mm2\n" 308 "punpckhbw %%mm7, %%mm3\n" 309 "psubw %%mm1, %%mm0\n" 310 "psubw %%mm3, %%mm2\n" 315 "movq 1(%0), %%mm1\n" 316 "movq %%mm4, %%mm5\n" 317 "movq %%mm1, %%mm3\n" 318 "punpcklbw %%mm7, %%mm4\n" 319 "punpcklbw %%mm7, %%mm1\n" 320 "punpckhbw %%mm7, %%mm5\n" 321 "punpckhbw %%mm7, %%mm3\n" 322 "psubw %%mm1, %%mm4\n" 323 "psubw %%mm3, %%mm5\n" 324 "psubw %%mm4, %%mm0\n" 325 "psubw %%mm5, %%mm2\n" 326 "pxor %%mm3, %%mm3\n" 327 "pxor %%mm1, %%mm1\n" 328 "pcmpgtw %%mm0, %%mm3\n\t" 329 "pcmpgtw %%mm2, %%mm1\n\t" 330 "pxor %%mm3, %%mm0\n" 331 "pxor %%mm1, %%mm2\n" 332 "psubw %%mm3, %%mm0\n" 333 "psubw %%mm1, %%mm2\n" 334 "paddw %%mm0, %%mm2\n" 335 "paddw %%mm2, %%mm6\n" 341 "movq 1(%0), %%mm1\n" 342 "movq %%mm0, %%mm2\n" 343 "movq %%mm1, %%mm3\n" 344 "punpcklbw %%mm7, %%mm0\n" 345 "punpcklbw %%mm7, %%mm1\n" 346 "punpckhbw %%mm7, %%mm2\n" 347 "punpckhbw %%mm7, %%mm3\n" 348 "psubw %%mm1, %%mm0\n" 349 "psubw %%mm3, %%mm2\n" 350 "psubw %%mm0, %%mm4\n" 351 "psubw %%mm2, %%mm5\n" 352 "pxor %%mm3, %%mm3\n" 353 "pxor %%mm1, %%mm1\n" 354 "pcmpgtw %%mm4, %%mm3\n\t" 355 "pcmpgtw %%mm5, %%mm1\n\t" 356 "pxor %%mm3, %%mm4\n" 357 "pxor %%mm1, %%mm5\n" 358 "psubw %%mm3, %%mm4\n" 359 "psubw %%mm1, %%mm5\n" 360 "paddw %%mm4, %%mm5\n" 361 "paddw %%mm5, %%mm6\n" 366 "movq 1(%0), %%mm1\n" 367 "movq %%mm4, %%mm5\n" 368 "movq %%mm1, %%mm3\n" 369 "punpcklbw %%mm7, %%mm4\n" 370 "punpcklbw %%mm7, %%mm1\n" 371 "punpckhbw %%mm7, %%mm5\n" 372 "punpckhbw %%mm7, %%mm3\n" 373 "psubw %%mm1, %%mm4\n" 374 "psubw %%mm3, %%mm5\n" 375 "psubw %%mm4, %%mm0\n" 376 "psubw %%mm5, %%mm2\n" 377 "pxor %%mm3, %%mm3\n" 378 "pxor %%mm1, %%mm1\n" 379 "pcmpgtw %%mm0, %%mm3\n\t" 380 "pcmpgtw %%mm2, %%mm1\n\t" 381 "pxor %%mm3, %%mm0\n" 382 "pxor %%mm1, %%mm2\n" 383 "psubw %%mm3, %%mm0\n" 384 "psubw %%mm1, %%mm2\n" 385 "paddw %%mm0, %%mm2\n" 386 "paddw %%mm2, %%mm6\n" 392 "movq %%mm6, %%mm0\n" 393 "punpcklwd %%mm7, %%mm0\n" 394 "punpckhwd %%mm7, %%mm6\n" 395 "paddd %%mm0, %%mm6\n" 397 "movq %%mm6, %%mm0\n" 399 "paddd %%mm6, %%mm0\n" 401 :
"+r" (pix1),
"=r" (tmp)
402 :
"r" ((
x86_reg) line_size),
"g" (h - 2)
405 return tmp + hf_noise8_mmx(pix + 8, line_size, h);
409 int line_size,
int h)
414 score1 = c->
mecc.
sse[0](c, pix1, pix2, line_size, h);
416 score1 = sse16_mmx(c, pix1, pix2, line_size, h);
417 score2 = hf_noise16_mmx(pix1, line_size, h) -
418 hf_noise16_mmx(pix2, line_size, h);
423 return score1 +
FFABS(score2) * 8;
427 int line_size,
int h)
429 int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
430 int score2 = hf_noise8_mmx(pix1, line_size, h) -
431 hf_noise8_mmx(pix2, line_size, h);
436 return score1 +
FFABS(score2) * 8;
440 int line_size,
int h)
444 assert((((
int) pix) & 7) == 0);
445 assert((line_size & 7) == 0);
447 #define SUM(in0, in1, out0, out1) \ 448 "movq (%0), %%mm2\n" \ 449 "movq 8(%0), %%mm3\n" \ 451 "movq %%mm2, " #out0 "\n" \ 452 "movq %%mm3, " #out1 "\n" \ 453 "psubusb " #in0 ", %%mm2\n" \ 454 "psubusb " #in1 ", %%mm3\n" \ 455 "psubusb " #out0 ", " #in0 "\n" \ 456 "psubusb " #out1 ", " #in1 "\n" \ 457 "por %%mm2, " #in0 "\n" \ 458 "por %%mm3, " #in1 "\n" \ 459 "movq " #in0 ", %%mm2\n" \ 460 "movq " #in1 ", %%mm3\n" \ 461 "punpcklbw %%mm7, " #in0 "\n" \ 462 "punpcklbw %%mm7, " #in1 "\n" \ 463 "punpckhbw %%mm7, %%mm2\n" \ 464 "punpckhbw %%mm7, %%mm3\n" \ 465 "paddw " #in1 ", " #in0 "\n" \ 466 "paddw %%mm3, %%mm2\n" \ 467 "paddw %%mm2, " #in0 "\n" \ 468 "paddw " #in0 ", %%mm6\n" 473 "pxor %%mm6, %%mm6\n" 474 "pxor %%mm7, %%mm7\n" 476 "movq 8(%0), %%mm1\n" 481 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
483 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
488 "movq %%mm6, %%mm0\n" 490 "paddw %%mm6, %%mm0\n" 491 "movq %%mm0, %%mm6\n" 493 "paddw %%mm6, %%mm0\n" 495 :
"+r" (pix),
"=r" (tmp)
496 :
"r" ((
x86_reg) line_size),
"m" (h)
504 int line_size,
int h)
508 assert((((
int) pix) & 7) == 0);
509 assert((line_size & 7) == 0);
511 #define SUM(in0, in1, out0, out1) \ 512 "movq (%0), " #out0 "\n" \ 513 "movq 8(%0), " #out1 "\n" \ 515 "psadbw " #out0 ", " #in0 "\n" \ 516 "psadbw " #out1 ", " #in1 "\n" \ 517 "paddw " #in1 ", " #in0 "\n" \ 518 "paddw " #in0 ", %%mm6\n" 522 "pxor %%mm6, %%mm6\n" 523 "pxor %%mm7, %%mm7\n" 525 "movq 8(%0), %%mm1\n" 530 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
532 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
538 :
"+r" (pix),
"=r" (tmp)
539 :
"r" ((
x86_reg) line_size),
"m" (h)
547 int line_size,
int h)
551 assert((((
int) pix1) & 7) == 0);
552 assert((((
int) pix2) & 7) == 0);
553 assert((line_size & 7) == 0);
555 #define SUM(in0, in1, out0, out1) \ 556 "movq (%0), %%mm2\n" \ 557 "movq (%1), " #out0 "\n" \ 558 "movq 8(%0), %%mm3\n" \ 559 "movq 8(%1), " #out1 "\n" \ 562 "psubb " #out0 ", %%mm2\n" \ 563 "psubb " #out1 ", %%mm3\n" \ 564 "pxor %%mm7, %%mm2\n" \ 565 "pxor %%mm7, %%mm3\n" \ 566 "movq %%mm2, " #out0 "\n" \ 567 "movq %%mm3, " #out1 "\n" \ 568 "psubusb " #in0 ", %%mm2\n" \ 569 "psubusb " #in1 ", %%mm3\n" \ 570 "psubusb " #out0 ", " #in0 "\n" \ 571 "psubusb " #out1 ", " #in1 "\n" \ 572 "por %%mm2, " #in0 "\n" \ 573 "por %%mm3, " #in1 "\n" \ 574 "movq " #in0 ", %%mm2\n" \ 575 "movq " #in1 ", %%mm3\n" \ 576 "punpcklbw %%mm7, " #in0 "\n" \ 577 "punpcklbw %%mm7, " #in1 "\n" \ 578 "punpckhbw %%mm7, %%mm2\n" \ 579 "punpckhbw %%mm7, %%mm3\n" \ 580 "paddw " #in1 ", " #in0 "\n" \ 581 "paddw %%mm3, %%mm2\n" \ 582 "paddw %%mm2, " #in0 "\n" \ 583 "paddw " #in0 ", %%mm6\n" 588 "pxor %%mm6, %%mm6\n" 589 "pcmpeqw %%mm7, %%mm7\n" 591 "packsswb %%mm7, %%mm7\n" 594 "movq 8(%0), %%mm1\n" 595 "movq 8(%1), %%mm3\n" 598 "psubb %%mm2, %%mm0\n" 599 "psubb %%mm3, %%mm1\n" 600 "pxor %%mm7, %%mm0\n" 601 "pxor %%mm7, %%mm1\n" 605 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
607 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
612 "movq %%mm6, %%mm0\n" 614 "paddw %%mm6, %%mm0\n" 615 "movq %%mm0, %%mm6\n" 617 "paddw %%mm6, %%mm0\n" 619 :
"+r" (pix1),
"+r" (pix2),
"=r" (tmp)
620 :
"r" ((
x86_reg) line_size),
"m" (h)
628 int line_size,
int h)
632 assert((((
int) pix1) & 7) == 0);
633 assert((((
int) pix2) & 7) == 0);
634 assert((line_size & 7) == 0);
636 #define SUM(in0, in1, out0, out1) \ 637 "movq (%0), " #out0 "\n" \ 638 "movq (%1), %%mm2\n" \ 639 "movq 8(%0), " #out1 "\n" \ 640 "movq 8(%1), %%mm3\n" \ 643 "psubb %%mm2, " #out0 "\n" \ 644 "psubb %%mm3, " #out1 "\n" \ 645 "pxor %%mm7, " #out0 "\n" \ 646 "pxor %%mm7, " #out1 "\n" \ 647 "psadbw " #out0 ", " #in0 "\n" \ 648 "psadbw " #out1 ", " #in1 "\n" \ 649 "paddw " #in1 ", " #in0 "\n" \ 650 "paddw " #in0 ", %%mm6\n " 654 "pxor %%mm6, %%mm6\n" 655 "pcmpeqw %%mm7, %%mm7\n" 657 "packsswb %%mm7, %%mm7\n" 660 "movq 8(%0), %%mm1\n" 661 "movq 8(%1), %%mm3\n" 664 "psubb %%mm2, %%mm0\n" 665 "psubb %%mm3, %%mm1\n" 666 "pxor %%mm7, %%mm0\n" 667 "pxor %%mm7, %%mm1\n" 671 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
673 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
679 :
"+r" (pix1),
"+r" (pix2),
"=r" (tmp)
680 :
"r" ((
x86_reg) line_size),
"m" (h)
687 #define MMABS_MMX(a,z) \ 688 "pxor " #z ", " #z " \n\t" \ 689 "pcmpgtw " #a ", " #z " \n\t" \ 690 "pxor " #z ", " #a " \n\t" \ 691 "psubw " #z ", " #a " \n\t" 693 #define MMABS_MMXEXT(a, z) \ 694 "pxor " #z ", " #z " \n\t" \ 695 "psubw " #a ", " #z " \n\t" \ 696 "pmaxsw " #z ", " #a " \n\t" 698 #define MMABS_SSSE3(a,z) \ 699 "pabsw " #a ", " #a " \n\t" 701 #define MMABS_SUM(a,z, sum) \ 703 "paddusw " #a ", " #sum " \n\t" 709 #define HSUM_MMX(a, t, dst) \ 710 "movq " #a ", " #t " \n\t" \ 711 "psrlq $32, " #a " \n\t" \ 712 "paddusw " #t ", " #a " \n\t" \ 713 "movq " #a ", " #t " \n\t" \ 714 "psrlq $16, " #a " \n\t" \ 715 "paddusw " #t ", " #a " \n\t" \ 716 "movd " #a ", " #dst " \n\t" \ 718 #define HSUM_MMXEXT(a, t, dst) \ 719 "pshufw $0x0E, " #a ", " #t " \n\t" \ 720 "paddusw " #t ", " #a " \n\t" \ 721 "pshufw $0x01, " #a ", " #t " \n\t" \ 722 "paddusw " #t ", " #a " \n\t" \ 723 "movd " #a ", " #dst " \n\t" \ 725 #define HSUM_SSE2(a, t, dst) \ 726 "movhlps " #a ", " #t " \n\t" \ 727 "paddusw " #t ", " #a " \n\t" \ 728 "pshuflw $0x0E, " #a ", " #t " \n\t" \ 729 "paddusw " #t ", " #a " \n\t" \ 730 "pshuflw $0x01, " #a ", " #t " \n\t" \ 731 "paddusw " #t ", " #a " \n\t" \ 732 "movd " #a ", " #dst " \n\t" \ 734 #define DCT_SAD4(m, mm, o) \ 735 "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \ 736 "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \ 737 "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \ 738 "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \ 739 MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \ 740 MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \ 741 MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \ 742 MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \ 744 #define DCT_SAD_MMX \ 745 "pxor %%mm0, %%mm0 \n\t" \ 746 "pxor %%mm1, %%mm1 \n\t" \ 747 DCT_SAD4(q, %%mm, 0) \ 748 DCT_SAD4(q, %%mm, 8) \ 749 DCT_SAD4(q, %%mm, 64) \ 750 DCT_SAD4(q, %%mm, 72) \ 751 "paddusw %%mm1, %%mm0 \n\t" \ 752 HSUM(%%mm0, %%mm1, %0) 754 #define DCT_SAD_SSE2 \ 755 "pxor %%xmm0, %%xmm0 \n\t" \ 756 "pxor %%xmm1, %%xmm1 \n\t" \ 757 DCT_SAD4(dqa, %%xmm, 0) \ 758 DCT_SAD4(dqa, %%xmm, 64) \ 759 "paddusw %%xmm1, %%xmm0 \n\t" \ 760 HSUM(%%xmm0, %%xmm1, %0) 762 #define DCT_SAD_FUNC(cpu) \ 763 static int sum_abs_dctelem_ ## cpu(int16_t *block) \ 770 return sum & 0xFFFF; \ 773 #define DCT_SAD DCT_SAD_MMX 774 #define HSUM(a, t, dst) HSUM_MMX(a, t, dst) 775 #define MMABS(a, z) MMABS_MMX(a, z) 780 #define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst) 781 #define MMABS(a, z) MMABS_MMXEXT(a, z) 786 #define DCT_SAD DCT_SAD_SSE2 787 #define HSUM(a, t, dst) HSUM_SSE2(a, t, dst) 791 #if HAVE_SSSE3_INLINE 792 #define MMABS(a, z) MMABS_SSSE3(a, z) 801 0x0000000000000000ULL,
802 0x0001000100010001ULL,
803 0x0002000200020002ULL,
814 "movq (%1, %%"REG_a
"), %%mm0 \n\t" 815 "movq (%2, %%"REG_a
"), %%mm2 \n\t" 816 "movq (%2, %%"REG_a
"), %%mm4 \n\t" 817 "add %3, %%"REG_a
" \n\t" 818 "psubusb %%mm0, %%mm2 \n\t" 819 "psubusb %%mm4, %%mm0 \n\t" 820 "movq (%1, %%"REG_a
"), %%mm1 \n\t" 821 "movq (%2, %%"REG_a
"), %%mm3 \n\t" 822 "movq (%2, %%"REG_a
"), %%mm5 \n\t" 823 "psubusb %%mm1, %%mm3 \n\t" 824 "psubusb %%mm5, %%mm1 \n\t" 825 "por %%mm2, %%mm0 \n\t" 826 "por %%mm1, %%mm3 \n\t" 827 "movq %%mm0, %%mm1 \n\t" 828 "movq %%mm3, %%mm2 \n\t" 829 "punpcklbw %%mm7, %%mm0 \n\t" 830 "punpckhbw %%mm7, %%mm1 \n\t" 831 "punpcklbw %%mm7, %%mm3 \n\t" 832 "punpckhbw %%mm7, %%mm2 \n\t" 833 "paddw %%mm1, %%mm0 \n\t" 834 "paddw %%mm3, %%mm2 \n\t" 835 "paddw %%mm2, %%mm0 \n\t" 836 "paddw %%mm0, %%mm6 \n\t" 837 "add %3, %%"REG_a
" \n\t" 849 "movq (%1), %%mm0 \n\t" 850 "movq (%1, %3), %%mm1 \n\t" 851 "psadbw (%2), %%mm0 \n\t" 852 "psadbw (%2, %3), %%mm1 \n\t" 853 "paddw %%mm0, %%mm6 \n\t" 854 "paddw %%mm1, %%mm6 \n\t" 855 "lea (%1,%3,2), %1 \n\t" 856 "lea (%2,%3,2), %2 \n\t" 859 :
"+r" (h),
"+r" (blk1),
"+r" (blk2)
868 "pxor %%xmm2, %%xmm2 \n\t" 871 "movdqu (%1), %%xmm0 \n\t" 872 "movdqu (%1, %4), %%xmm1 \n\t" 873 "psadbw (%2), %%xmm0 \n\t" 874 "psadbw (%2, %4), %%xmm1 \n\t" 875 "paddw %%xmm0, %%xmm2 \n\t" 876 "paddw %%xmm1, %%xmm2 \n\t" 877 "lea (%1,%4,2), %1 \n\t" 878 "lea (%2,%4,2), %2 \n\t" 881 "movhlps %%xmm2, %%xmm0 \n\t" 882 "paddw %%xmm0, %%xmm2 \n\t" 883 "movd %%xmm2, %3 \n\t" 884 :
"+r" (h),
"+r" (blk1),
"+r" (blk2),
"=r" (ret)
895 "movq (%1), %%mm0 \n\t" 896 "movq (%1, %3), %%mm1 \n\t" 897 "pavgb 1(%1), %%mm0 \n\t" 898 "pavgb 1(%1, %3), %%mm1 \n\t" 899 "psadbw (%2), %%mm0 \n\t" 900 "psadbw (%2, %3), %%mm1 \n\t" 901 "paddw %%mm0, %%mm6 \n\t" 902 "paddw %%mm1, %%mm6 \n\t" 903 "lea (%1,%3,2), %1 \n\t" 904 "lea (%2,%3,2), %2 \n\t" 907 :
"+r" (h),
"+r" (blk1),
"+r" (blk2)
915 "movq (%1), %%mm0 \n\t" 919 "movq (%1), %%mm1 \n\t" 920 "movq (%1, %3), %%mm2 \n\t" 921 "pavgb %%mm1, %%mm0 \n\t" 922 "pavgb %%mm2, %%mm1 \n\t" 923 "psadbw (%2), %%mm0 \n\t" 924 "psadbw (%2, %3), %%mm1 \n\t" 925 "paddw %%mm0, %%mm6 \n\t" 926 "paddw %%mm1, %%mm6 \n\t" 927 "movq %%mm2, %%mm0 \n\t" 928 "lea (%1,%3,2), %1 \n\t" 929 "lea (%2,%3,2), %2 \n\t" 932 :
"+r" (h),
"+r" (blk1),
"+r" (blk2)
940 "movq "MANGLE(bone)
", %%mm5 \n\t" 941 "movq (%1), %%mm0 \n\t" 942 "pavgb 1(%1), %%mm0 \n\t" 946 "movq (%1), %%mm1 \n\t" 947 "movq (%1,%3), %%mm2 \n\t" 948 "pavgb 1(%1), %%mm1 \n\t" 949 "pavgb 1(%1,%3), %%mm2 \n\t" 950 "psubusb %%mm5, %%mm1 \n\t" 951 "pavgb %%mm1, %%mm0 \n\t" 952 "pavgb %%mm2, %%mm1 \n\t" 953 "psadbw (%2), %%mm0 \n\t" 954 "psadbw (%2,%3), %%mm1 \n\t" 955 "paddw %%mm0, %%mm6 \n\t" 956 "paddw %%mm1, %%mm6 \n\t" 957 "movq %%mm2, %%mm0 \n\t" 958 "lea (%1,%3,2), %1 \n\t" 959 "lea (%2,%3,2), %2 \n\t" 962 :
"+r" (h),
"+r" (blk1),
"+r" (blk2)
973 "movq (%1, %%"REG_a
"), %%mm0 \n\t" 974 "movq (%2, %%"REG_a
"), %%mm1 \n\t" 975 "movq (%1, %%"REG_a
"), %%mm2 \n\t" 976 "movq (%2, %%"REG_a
"), %%mm3 \n\t" 977 "punpcklbw %%mm7, %%mm0 \n\t" 978 "punpcklbw %%mm7, %%mm1 \n\t" 979 "punpckhbw %%mm7, %%mm2 \n\t" 980 "punpckhbw %%mm7, %%mm3 \n\t" 981 "paddw %%mm0, %%mm1 \n\t" 982 "paddw %%mm2, %%mm3 \n\t" 983 "movq (%3, %%"REG_a
"), %%mm4 \n\t" 984 "movq (%3, %%"REG_a
"), %%mm2 \n\t" 985 "paddw %%mm5, %%mm1 \n\t" 986 "paddw %%mm5, %%mm3 \n\t" 987 "psrlw $1, %%mm1 \n\t" 988 "psrlw $1, %%mm3 \n\t" 989 "packuswb %%mm3, %%mm1 \n\t" 990 "psubusb %%mm1, %%mm4 \n\t" 991 "psubusb %%mm2, %%mm1 \n\t" 992 "por %%mm4, %%mm1 \n\t" 993 "movq %%mm1, %%mm0 \n\t" 994 "punpcklbw %%mm7, %%mm0 \n\t" 995 "punpckhbw %%mm7, %%mm1 \n\t" 996 "paddw %%mm1, %%mm0 \n\t" 997 "paddw %%mm0, %%mm6 \n\t" 998 "add %4, %%"REG_a
" \n\t" 1001 :
"r" (blk1a - len),
"r" (blk1b -
len),
"r" (blk2 - len),
1005 static inline void sad8_4_mmx(
uint8_t *blk1,
uint8_t *blk2,
int stride,
int h)
1009 "movq (%1, %%"REG_a
"), %%mm0 \n\t" 1010 "movq 1(%1, %%"REG_a
"), %%mm2 \n\t" 1011 "movq %%mm0, %%mm1 \n\t" 1012 "movq %%mm2, %%mm3 \n\t" 1013 "punpcklbw %%mm7, %%mm0 \n\t" 1014 "punpckhbw %%mm7, %%mm1 \n\t" 1015 "punpcklbw %%mm7, %%mm2 \n\t" 1016 "punpckhbw %%mm7, %%mm3 \n\t" 1017 "paddw %%mm2, %%mm0 \n\t" 1018 "paddw %%mm3, %%mm1 \n\t" 1021 "movq (%2, %%"REG_a
"), %%mm2 \n\t" 1022 "movq 1(%2, %%"REG_a
"), %%mm4 \n\t" 1023 "movq %%mm2, %%mm3 \n\t" 1024 "movq %%mm4, %%mm5 \n\t" 1025 "punpcklbw %%mm7, %%mm2 \n\t" 1026 "punpckhbw %%mm7, %%mm3 \n\t" 1027 "punpcklbw %%mm7, %%mm4 \n\t" 1028 "punpckhbw %%mm7, %%mm5 \n\t" 1029 "paddw %%mm4, %%mm2 \n\t" 1030 "paddw %%mm5, %%mm3 \n\t" 1031 "movq 16+"MANGLE(round_tab)
", %%mm5 \n\t" 1032 "paddw %%mm2, %%mm0 \n\t" 1033 "paddw %%mm3, %%mm1 \n\t" 1034 "paddw %%mm5, %%mm0 \n\t" 1035 "paddw %%mm5, %%mm1 \n\t" 1036 "movq (%3, %%"REG_a
"), %%mm4 \n\t" 1037 "movq (%3, %%"REG_a
"), %%mm5 \n\t" 1038 "psrlw $2, %%mm0 \n\t" 1039 "psrlw $2, %%mm1 \n\t" 1040 "packuswb %%mm1, %%mm0 \n\t" 1041 "psubusb %%mm0, %%mm4 \n\t" 1042 "psubusb %%mm5, %%mm0 \n\t" 1043 "por %%mm4, %%mm0 \n\t" 1044 "movq %%mm0, %%mm4 \n\t" 1045 "punpcklbw %%mm7, %%mm0 \n\t" 1046 "punpckhbw %%mm7, %%mm4 \n\t" 1047 "paddw %%mm0, %%mm6 \n\t" 1048 "paddw %%mm4, %%mm6 \n\t" 1049 "movq %%mm2, %%mm0 \n\t" 1050 "movq %%mm3, %%mm1 \n\t" 1051 "add %4, %%"REG_a
" \n\t" 1054 :
"r" (blk1 - len),
"r" (blk1 - len +
stride),
"r" (blk2 - len),
1058 static inline int sum_mmx(
void)
1062 "movq %%mm6, %%mm0 \n\t" 1063 "psrlq $32, %%mm6 \n\t" 1064 "paddw %%mm0, %%mm6 \n\t" 1065 "movq %%mm6, %%mm0 \n\t" 1066 "psrlq $16, %%mm6 \n\t" 1067 "paddw %%mm0, %%mm6 \n\t" 1068 "movd %%mm6, %0 \n\t" 1070 return ret & 0xFFFF;
1073 static inline int sum_mmxext(
void)
1077 "movd %%mm6, %0 \n\t" 1082 static inline void sad8_x2a_mmx(
uint8_t *blk1,
uint8_t *blk2,
int stride,
int h)
1084 sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
1087 static inline void sad8_y2a_mmx(
uint8_t *blk1,
uint8_t *blk2,
int stride,
int h)
1089 sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
1092 #define PIX_SAD(suf) \ 1093 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1094 uint8_t *blk1, int stride, int h) \ 1097 __asm__ volatile ( \ 1098 "pxor %%mm7, %%mm7 \n\t" \ 1099 "pxor %%mm6, %%mm6 \n\t" \ 1102 sad8_1_ ## suf(blk1, blk2, stride, 8); \ 1104 return sum_ ## suf(); \ 1107 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1108 uint8_t *blk1, int stride, int h) \ 1111 __asm__ volatile ( \ 1112 "pxor %%mm7, %%mm7 \n\t" \ 1113 "pxor %%mm6, %%mm6 \n\t" \ 1114 "movq %0, %%mm5 \n\t" \ 1115 :: "m" (round_tab[1])); \ 1117 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ 1119 return sum_ ## suf(); \ 1122 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1123 uint8_t *blk1, int stride, int h) \ 1126 __asm__ volatile ( \ 1127 "pxor %%mm7, %%mm7 \n\t" \ 1128 "pxor %%mm6, %%mm6 \n\t" \ 1129 "movq %0, %%mm5 \n\t" \ 1130 :: "m" (round_tab[1])); \ 1132 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ 1134 return sum_ ## suf(); \ 1137 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1138 uint8_t *blk1, int stride, int h) \ 1141 __asm__ volatile ( \ 1142 "pxor %%mm7, %%mm7 \n\t" \ 1143 "pxor %%mm6, %%mm6 \n\t" \ 1146 sad8_4_ ## suf(blk1, blk2, stride, 8); \ 1148 return sum_ ## suf(); \ 1151 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1152 uint8_t *blk1, int stride, int h) \ 1154 __asm__ volatile ( \ 1155 "pxor %%mm7, %%mm7 \n\t" \ 1156 "pxor %%mm6, %%mm6 \n\t" \ 1159 sad8_1_ ## suf(blk1, blk2, stride, h); \ 1160 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 1162 return sum_ ## suf(); \ 1165 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1166 uint8_t *blk1, int stride, int h) \ 1168 __asm__ volatile ( \ 1169 "pxor %%mm7, %%mm7 \n\t" \ 1170 "pxor %%mm6, %%mm6 \n\t" \ 1171 "movq %0, %%mm5 \n\t" \ 1172 :: "m" (round_tab[1])); \ 1174 sad8_x2a_ ## suf(blk1, blk2, stride, h); \ 1175 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 1177 return sum_ ## suf(); \ 1180 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1181 uint8_t *blk1, int stride, int h) \ 1183 __asm__ volatile ( \ 1184 "pxor %%mm7, %%mm7 \n\t" \ 1185 "pxor %%mm6, %%mm6 \n\t" \ 1186 "movq %0, %%mm5 \n\t" \ 1187 :: "m" (round_tab[1])); \ 1189 sad8_y2a_ ## suf(blk1, blk2, stride, h); \ 1190 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 1192 return sum_ ## suf(); \ 1195 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 1196 uint8_t *blk1, int stride, int h) \ 1198 __asm__ volatile ( \ 1199 "pxor %%mm7, %%mm7 \n\t" \ 1200 "pxor %%mm6, %%mm6 \n\t" \ 1203 sad8_4_ ## suf(blk1, blk2, stride, h); \ 1204 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 1206 return sum_ ## suf(); \ 1215 int line_size,
int h);
1217 #define hadamard_func(cpu) \ 1218 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 1219 uint8_t *src2, int stride, int h); \ 1220 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 1221 uint8_t *src2, int stride, int h); 1234 c->sum_abs_dctelem = sum_abs_dctelem_mmx;
1236 c->pix_abs[0][0] = sad16_mmx;
1237 c->pix_abs[0][1] = sad16_x2_mmx;
1238 c->pix_abs[0][2] = sad16_y2_mmx;
1239 c->pix_abs[0][3] = sad16_xy2_mmx;
1240 c->pix_abs[1][0] = sad8_mmx;
1241 c->pix_abs[1][1] = sad8_x2_mmx;
1242 c->pix_abs[1][2] = sad8_y2_mmx;
1243 c->pix_abs[1][3] = sad8_xy2_mmx;
1245 c->sad[0] = sad16_mmx;
1246 c->sad[1] = sad8_mmx;
1248 c->sse[0] = sse16_mmx;
1249 c->sse[1] = sse8_mmx;
1250 c->vsad[4] = vsad_intra16_mmx;
1252 c->nsse[0] = nsse16_mmx;
1253 c->nsse[1] = nsse8_mmx;
1256 c->vsad[0] = vsad16_mmx;
1261 c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
1263 c->vsad[4] = vsad_intra16_mmxext;
1265 c->pix_abs[0][0] = sad16_mmxext;
1266 c->pix_abs[1][0] = sad8_mmxext;
1268 c->sad[0] = sad16_mmxext;
1269 c->sad[1] = sad8_mmxext;
1272 c->pix_abs[0][1] = sad16_x2_mmxext;
1273 c->pix_abs[0][2] = sad16_y2_mmxext;
1274 c->pix_abs[0][3] = sad16_xy2_mmxext;
1275 c->pix_abs[1][1] = sad8_x2_mmxext;
1276 c->pix_abs[1][2] = sad8_y2_mmxext;
1277 c->pix_abs[1][3] = sad8_xy2_mmxext;
1279 c->vsad[0] = vsad16_mmxext;
1284 c->sum_abs_dctelem = sum_abs_dctelem_sse2;
1288 c->sad[0] = sad16_sse2;
1291 #if HAVE_SSSE3_INLINE 1293 c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
1299 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
1300 c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
1304 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
1305 c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
1311 #if HAVE_ALIGNED_STACK 1312 c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
1313 c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
1318 c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
1319 c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#define EXTERNAL_MMX(flags)
#define INLINE_SSE2(flags)
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
Macro definitions for various function/variable attributes.
#define DECLARE_ASM_CONST(n, t, v)
#define hadamard_func(cpu)
#define CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
#define EXTERNAL_SSE2(flags)
#define INLINE_MMX(flags)
#define INLINE_SSSE3(flags)
#define AV_CPU_FLAG_3DNOW
AMD 3DNOW.
main external API structure.
#define EXTERNAL_SSSE3(flags)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
#define EXTERNAL_MMXEXT(flags)
struct AVCodecContext * avctx
#define INLINE_MMXEXT(flags)
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)