Libav
vc1dsp_mmx.c
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"

#if HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

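/* Add the rounding bias held in mm7 to mm3/mm4 and shift right to
 * normalize the filter output. */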
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

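/* Pack the two 16-bit result registers to unsigned bytes and store 8
 * pixels, or store both registers unpacked when 16-bit intermediates
 * are wanted; OP merges with the destination for the avg variants. */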
#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

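/* Widen 8-bit samples to 16 bits using the zero register mm0, or leave
 * already-unpacked data untouched. */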
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

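/* Broadcast the 16-bit rounder ROUND to all four words of mm7. */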
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

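/* One output line of the vertical (-1,9,9,-1) 1/2-shift filter: R1
 * accumulates 9*(R1+R2) minus the two outer taps loaded into R0 and R3,
 * is rounded by mm7, shifted, and stored at OFF(%1); the registers
 * rotate from line to line. */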
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

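/* Vertical 1/2-shift filter producing 16-bit intermediates: three passes
 * of four columns each fill 8 rows of 12 values (row stride 24 bytes). */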
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}

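/* Horizontal 1/2-shift filter reading the 16-bit intermediates; the data
 * is already unpacked, so the (-1,9,9,-1) taps are applied directly from
 * memory and the result is packed back to 8-bit pixels. */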
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


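/* Purely horizontal or vertical 1/2-shift interpolation on 8-bit pixels:
 * 'offset' is 1 for the horizontal case and 'stride' for the vertical
 * one. */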
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0 ), %%mm3 \n\t"\
        "movd 4(%0 ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

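/* Core of the 1/4 and 3/4 shift bicubic interpolation: accumulates
 * 53*A3 + 18*A2 - 3*A1 - 4*A4 into mm3/mm4. UNPACK widens 8-bit input
 * to 16 bits (or is a no-op), MOVQ is "movd 1" for packed input or
 * "movq 2" for 16-bit input, and A1..A4 address the four filter taps. */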
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

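/* Builds the vertical bicubic filter with 16-bit output: each of the 8
 * iterations filters 11 columns (stored as 12) into a row of 24 bytes.
 * A1..A4 must be expressed in terms of %3 (src_stride) and
 * %4 (3*src_stride). */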
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
        : "memory" \
    ); \
}

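/* Builds the horizontal bicubic filter reading 16-bit intermediates;
 * the input is already unpacked, so A1..A4 are plain offsets from %1. */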
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

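/* Builds the single-pass 8-bit bicubic filter, usable both horizontally
 * and vertically; A1..A4 must be expressed in terms of %3 (offset) and
 * %4 (3*offset). */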
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

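/* 1/4 shift bicubic interpolation */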
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

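/* 3/4 shift bicubic interpolation */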
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

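/* Interpolate fractional-pel values by applying the vertical filter
 * selected by vmode, then the horizontal filter selected by hmode
 * (1 = 1/4 shift, 2 = 1/2 shift, 3 = 3/4 shift); the bidirectional case
 * goes through the 16-bit buffer tmp, the one-dimensional cases use the
 * direct 8-bit filters. */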
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

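/* Instantiate the put (MMX) and avg (MMXEXT) wrappers for the 15
 * fractional-pel positions; a is the horizontal and b the vertical
 * quarter-pel shift. */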
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

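/* DC-only inverse transforms: the scaled DC is computed on the host CPU,
 * then the value (and its negation, to handle negative DC with unsigned
 * saturation) is broadcast to mm0/mm1 and applied to the destination
 * block with paddusb/psubusb. */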
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_INLINE_ASM */