24 #include "libavutil/mem.h"
41 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
46 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51 #define COL_SHIFT 20 // 6
57 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
60 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
87 int16_t *
const temp= (int16_t*)align_tmp;
90 #if 0 //Alternative, simpler variant
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to a run of adjacent string literals that form MMX inline-assembly
 * text. Every argument is stringized with '#' and pasted into the instruction
 * stream, so arguments must be valid assembler operand / instruction text.
 *
 *   src0, src4, src1, src5  memory operands; one quadword is loaded from each
 *                           into mm0, mm1, mm2 and mm3 respectively (src1 is
 *                           loaded a second time further down, into mm1).
 *   dst                     memory operand; four quadwords are stored, at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           that is applied once to mm4 and once to mm0.
 *   shift                   immediate count for every psrad.
 *
 * Multiplier quadwords are read at byte offsets 16..104 from asm operand %2.
 * pmaddwd multiplies packed signed words and adds adjacent products into
 * dwords; the dword results are shifted right arithmetically by `shift` and
 * narrowed back to words with signed saturation (packssdw).
 * All of mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably one row pass of the 8x8 IDCT, with the parameter
 * names hinting at which coefficients each source holds - confirm against the
 * coefficient table, which is not visible here.
 */
92 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
93 "movq " #src0 ", %%mm0 \n\t" \
94 "movq " #src4 ", %%mm1 \n\t" \
95 "movq " #src1 ", %%mm2 \n\t" \
96 "movq " #src5 ", %%mm3 \n\t" \
97 "movq 16(%2), %%mm4 \n\t" \
98 "pmaddwd %%mm0, %%mm4 \n\t" \
99 "movq 24(%2), %%mm5 \n\t" \
100 "pmaddwd %%mm5, %%mm0 \n\t" \
101 "movq 32(%2), %%mm5 \n\t" \
102 "pmaddwd %%mm1, %%mm5 \n\t" \
103 "movq 40(%2), %%mm6 \n\t" \
104 "pmaddwd %%mm6, %%mm1 \n\t" \
105 "movq 48(%2), %%mm7 \n\t" \
106 "pmaddwd %%mm2, %%mm7 \n\t" \
107 #rounder ", %%mm4 \n\t" /* rounder applied to mm4 */\
108 "movq %%mm4, %%mm6 \n\t" \
109 "paddd %%mm5, %%mm4 \n\t" \
110 "psubd %%mm5, %%mm6 \n\t" \
111 "movq 56(%2), %%mm5 \n\t" \
112 "pmaddwd %%mm3, %%mm5 \n\t" \
113 #rounder ", %%mm0 \n\t" /* rounder applied to mm0 */\
114 "paddd %%mm0, %%mm1 \n\t" \
115 "paddd %%mm0, %%mm0 \n\t" \
116 "psubd %%mm1, %%mm0 \n\t" /* mm0 = old mm0 - old mm1, computed as 2a-(a+b) */ \
117 "pmaddwd 64(%2), %%mm2 \n\t" \
118 "paddd %%mm5, %%mm7 \n\t" \
119 "movq 72(%2), %%mm5 \n\t" \
120 "pmaddwd %%mm3, %%mm5 \n\t" \
121 "paddd %%mm4, %%mm7 \n\t" \
122 "paddd %%mm4, %%mm4 \n\t" \
123 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, same 2a-(a+b) trick */ \
124 "paddd %%mm2, %%mm5 \n\t" \
125 "psrad $" #shift ", %%mm7 \n\t"\
126 "psrad $" #shift ", %%mm4 \n\t"\
127 "movq %%mm1, %%mm2 \n\t" \
128 "paddd %%mm5, %%mm1 \n\t" \
129 "psubd %%mm5, %%mm2 \n\t" \
130 "psrad $" #shift ", %%mm1 \n\t"\
131 "psrad $" #shift ", %%mm2 \n\t"\
132 "packssdw %%mm1, %%mm7 \n\t" /* narrow dwords to words with signed saturation */ \
133 "packssdw %%mm4, %%mm2 \n\t" \
134 "movq %%mm7, " #dst " \n\t"\
135 "movq " #src1 ", %%mm1 \n\t" /* src1 reloaded; mm2 was consumed above */ \
136 "movq 80(%2), %%mm4 \n\t" \
137 "movq %%mm2, 24+" #dst " \n\t"\
138 "pmaddwd %%mm1, %%mm4 \n\t" \
139 "movq 88(%2), %%mm7 \n\t" \
140 "pmaddwd 96(%2), %%mm1 \n\t" \
141 "pmaddwd %%mm3, %%mm7 \n\t" \
142 "movq %%mm0, %%mm2 \n\t" \
143 "pmaddwd 104(%2), %%mm3 \n\t" \
144 "paddd %%mm7, %%mm4 \n\t" \
145 "paddd %%mm4, %%mm2 \n\t" \
146 "psubd %%mm4, %%mm0 \n\t" \
147 "psrad $" #shift ", %%mm2 \n\t"\
148 "psrad $" #shift ", %%mm0 \n\t"\
149 "movq %%mm6, %%mm4 \n\t" \
150 "paddd %%mm1, %%mm3 \n\t" \
151 "paddd %%mm3, %%mm6 \n\t" \
152 "psubd %%mm3, %%mm4 \n\t" \
153 "psrad $" #shift ", %%mm6 \n\t"\
154 "packssdw %%mm6, %%mm2 \n\t" \
155 "movq %%mm2, 8+" #dst " \n\t"\
156 "psrad $" #shift ", %%mm4 \n\t"\
157 "packssdw %%mm0, %%mm4 \n\t" \
158 "movq %%mm4, 16+" #dst " \n\t"\
/*
 * COL_IDCT(src0, src4, src1, src5, dst, shift)
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src4, src1, src5  memory operands; one quadword is loaded from each
 *                           into mm0..mm3 (src1 is loaded again later, into
 *                           mm0).
 *   dst                     memory operand; eight 32-bit stores (movd) are
 *                           made, at dst + {0,16,32,48,64,80,96,112}.
 *   shift                   immediate count for every psrad.
 *
 * Unlike the row macros there is no rounder argument. Multiplier quadwords
 * are read at byte offsets 16..104 from asm operand %2. Each result register
 * is packed with itself (packssdw r, r), so its low 32 bits hold two
 * saturated 16-bit values, and that low half is what movd stores.
 * All of mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably the column pass, producing two columns per
 * expansion with a 16-byte row stride - confirm against the callers.
 */
160 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
161 "movq " #src0 ", %%mm0 \n\t" \
162 "movq " #src4 ", %%mm1 \n\t" \
163 "movq " #src1 ", %%mm2 \n\t" \
164 "movq " #src5 ", %%mm3 \n\t" \
165 "movq 16(%2), %%mm4 \n\t" \
166 "pmaddwd %%mm0, %%mm4 \n\t" \
167 "movq 24(%2), %%mm5 \n\t" \
168 "pmaddwd %%mm5, %%mm0 \n\t" \
169 "movq 32(%2), %%mm5 \n\t" \
170 "pmaddwd %%mm1, %%mm5 \n\t" \
171 "movq 40(%2), %%mm6 \n\t" \
172 "pmaddwd %%mm6, %%mm1 \n\t" \
173 "movq %%mm4, %%mm6 \n\t" \
174 "movq 48(%2), %%mm7 \n\t" \
175 "pmaddwd %%mm2, %%mm7 \n\t" \
176 "paddd %%mm5, %%mm4 \n\t" \
177 "psubd %%mm5, %%mm6 \n\t" \
178 "movq %%mm0, %%mm5 \n\t" \
179 "paddd %%mm1, %%mm0 \n\t" \
180 "psubd %%mm1, %%mm5 \n\t" \
181 "movq 56(%2), %%mm1 \n\t" \
182 "pmaddwd %%mm3, %%mm1 \n\t" \
183 "pmaddwd 64(%2), %%mm2 \n\t" \
184 "paddd %%mm1, %%mm7 \n\t" \
185 "movq 72(%2), %%mm1 \n\t" \
186 "pmaddwd %%mm3, %%mm1 \n\t" \
187 "paddd %%mm4, %%mm7 \n\t" \
188 "paddd %%mm4, %%mm4 \n\t" \
189 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, computed as 2a-(a+b) */ \
190 "paddd %%mm2, %%mm1 \n\t" \
191 "psrad $" #shift ", %%mm7 \n\t"\
192 "psrad $" #shift ", %%mm4 \n\t"\
193 "movq %%mm0, %%mm2 \n\t" \
194 "paddd %%mm1, %%mm0 \n\t" \
195 "psubd %%mm1, %%mm2 \n\t" \
196 "psrad $" #shift ", %%mm0 \n\t"\
197 "psrad $" #shift ", %%mm2 \n\t"\
198 "packssdw %%mm7, %%mm7 \n\t" /* narrow with signed saturation; both halves identical */ \
199 "movd %%mm7, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
200 "packssdw %%mm0, %%mm0 \n\t" \
201 "movd %%mm0, 16+" #dst " \n\t"\
202 "packssdw %%mm2, %%mm2 \n\t" \
203 "movd %%mm2, 96+" #dst " \n\t"\
204 "packssdw %%mm4, %%mm4 \n\t" \
205 "movd %%mm4, 112+" #dst " \n\t"\
206 "movq " #src1 ", %%mm0 \n\t" /* src1 reloaded; mm2 was consumed above */ \
207 "movq 80(%2), %%mm4 \n\t" \
208 "pmaddwd %%mm0, %%mm4 \n\t" \
209 "movq 88(%2), %%mm7 \n\t" \
210 "pmaddwd 96(%2), %%mm0 \n\t" \
211 "pmaddwd %%mm3, %%mm7 \n\t" \
212 "movq %%mm5, %%mm2 \n\t" \
213 "pmaddwd 104(%2), %%mm3 \n\t" \
214 "paddd %%mm7, %%mm4 \n\t" \
215 "paddd %%mm4, %%mm2 \n\t" \
216 "psubd %%mm4, %%mm5 \n\t" \
217 "psrad $" #shift ", %%mm2 \n\t"\
218 "psrad $" #shift ", %%mm5 \n\t"\
219 "movq %%mm6, %%mm4 \n\t" \
220 "paddd %%mm0, %%mm3 \n\t" \
221 "paddd %%mm3, %%mm6 \n\t" \
222 "psubd %%mm3, %%mm4 \n\t" \
223 "psrad $" #shift ", %%mm6 \n\t"\
224 "psrad $" #shift ", %%mm4 \n\t"\
225 "packssdw %%mm2, %%mm2 \n\t" \
226 "packssdw %%mm6, %%mm6 \n\t" \
227 "movd %%mm2, 32+" #dst " \n\t"\
228 "packssdw %%mm4, %%mm4 \n\t" \
229 "packssdw %%mm5, %%mm5 \n\t" \
230 "movd %%mm6, 48+" #dst " \n\t"\
231 "movd %%mm4, 64+" #dst " \n\t"\
232 "movd %%mm5, 80+" #dst " \n\t"\
/*
 * DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'. Three stages are visible:
 *
 *   1. Test:  the four source quadwords are loaded into mm0..mm3; mm0 is
 *             ANDed with the mask MANGLE(wm1010) and ORed with mm1, mm2 and
 *             mm3; the result is packed and moved to eax, and "orl eax,eax"
 *             sets the flags (ZF is set when the combined value is zero).
 *   2. Full:  the same multiply/accumulate sequence as ROW_IDCT, storing
 *             quadwords at dst, 8+dst, 16+dst and 24+dst.
 *   3. Short: mm0 is shifted left by 16 and right (arithmetic) by 13, packed,
 *             and that single quadword is stored to all four dst slots.
 *
 *   rounder   instruction text applied to mm4 and mm0 in stage 2.
 *   shift     immediate count for the psrad instructions of stage 2.
 *
 * eax and mm0..mm7 are overwritten. Constants are read from asm operand %2.
 *
 * NOTE(review): no jump or label is visible between the stages in this view,
 * so which stage runs for which input cannot be determined from here;
 * presumably stage 1 selects between stages 2 and 3 - confirm against the
 * complete file.
 * NOTE(review): the contents of wm1010 are not visible; presumably the mask
 * excludes the DC word from the zero test - confirm.
 */
235 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
236 "movq " #src0 ", %%mm0 \n\t" \
237 "movq " #src4 ", %%mm1 \n\t" \
238 "movq " #src1 ", %%mm2 \n\t" \
239 "movq " #src5 ", %%mm3 \n\t" \
240 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
241 "pand %%mm0, %%mm4 \n\t"\
242 "por %%mm1, %%mm4 \n\t"\
243 "por %%mm2, %%mm4 \n\t"\
244 "por %%mm3, %%mm4 \n\t"\
245 "packssdw %%mm4,%%mm4 \n\t"\
246 "movd %%mm4, %%eax \n\t"\
247 "orl %%eax, %%eax \n\t" /* sets ZF when the combined value is zero */\
249 "movq 16(%2), %%mm4 \n\t" \
250 "pmaddwd %%mm0, %%mm4 \n\t" \
251 "movq 24(%2), %%mm5 \n\t" \
252 "pmaddwd %%mm5, %%mm0 \n\t" \
253 "movq 32(%2), %%mm5 \n\t" \
254 "pmaddwd %%mm1, %%mm5 \n\t" \
255 "movq 40(%2), %%mm6 \n\t" \
256 "pmaddwd %%mm6, %%mm1 \n\t" \
257 "movq 48(%2), %%mm7 \n\t" \
258 "pmaddwd %%mm2, %%mm7 \n\t" \
259 #rounder ", %%mm4 \n\t" /* rounder applied to mm4 */\
260 "movq %%mm4, %%mm6 \n\t" \
261 "paddd %%mm5, %%mm4 \n\t" \
262 "psubd %%mm5, %%mm6 \n\t" \
263 "movq 56(%2), %%mm5 \n\t" \
264 "pmaddwd %%mm3, %%mm5 \n\t" \
265 #rounder ", %%mm0 \n\t" /* rounder applied to mm0 */\
266 "paddd %%mm0, %%mm1 \n\t" \
267 "paddd %%mm0, %%mm0 \n\t" \
268 "psubd %%mm1, %%mm0 \n\t" /* mm0 = old mm0 - old mm1, computed as 2a-(a+b) */ \
269 "pmaddwd 64(%2), %%mm2 \n\t" \
270 "paddd %%mm5, %%mm7 \n\t" \
271 "movq 72(%2), %%mm5 \n\t" \
272 "pmaddwd %%mm3, %%mm5 \n\t" \
273 "paddd %%mm4, %%mm7 \n\t" \
274 "paddd %%mm4, %%mm4 \n\t" \
275 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, same trick */ \
276 "paddd %%mm2, %%mm5 \n\t" \
277 "psrad $" #shift ", %%mm7 \n\t"\
278 "psrad $" #shift ", %%mm4 \n\t"\
279 "movq %%mm1, %%mm2 \n\t" \
280 "paddd %%mm5, %%mm1 \n\t" \
281 "psubd %%mm5, %%mm2 \n\t" \
282 "psrad $" #shift ", %%mm1 \n\t"\
283 "psrad $" #shift ", %%mm2 \n\t"\
284 "packssdw %%mm1, %%mm7 \n\t" \
285 "packssdw %%mm4, %%mm2 \n\t" \
286 "movq %%mm7, " #dst " \n\t"\
287 "movq " #src1 ", %%mm1 \n\t" \
288 "movq 80(%2), %%mm4 \n\t" \
289 "movq %%mm2, 24+" #dst " \n\t"\
290 "pmaddwd %%mm1, %%mm4 \n\t" \
291 "movq 88(%2), %%mm7 \n\t" \
292 "pmaddwd 96(%2), %%mm1 \n\t" \
293 "pmaddwd %%mm3, %%mm7 \n\t" \
294 "movq %%mm0, %%mm2 \n\t" \
295 "pmaddwd 104(%2), %%mm3 \n\t" \
296 "paddd %%mm7, %%mm4 \n\t" \
297 "paddd %%mm4, %%mm2 \n\t" \
298 "psubd %%mm4, %%mm0 \n\t" \
299 "psrad $" #shift ", %%mm2 \n\t"\
300 "psrad $" #shift ", %%mm0 \n\t"\
301 "movq %%mm6, %%mm4 \n\t" \
302 "paddd %%mm1, %%mm3 \n\t" \
303 "paddd %%mm3, %%mm6 \n\t" \
304 "psubd %%mm3, %%mm4 \n\t" \
305 "psrad $" #shift ", %%mm6 \n\t"\
306 "packssdw %%mm6, %%mm2 \n\t" \
307 "movq %%mm2, 8+" #dst " \n\t"\
308 "psrad $" #shift ", %%mm4 \n\t"\
309 "packssdw %%mm0, %%mm4 \n\t" \
310 "movq %%mm4, 16+" #dst " \n\t"\
313 "pslld $16, %%mm0 \n\t"\
314 "#paddd "MANGLE(d40000)", %%mm0 \n\t" /* NOTE(review): text starts with '#', presumably an assembler comment, so this add is disabled - confirm */\
315 "psrad $13, %%mm0 \n\t"\
316 "packssdw %%mm0, %%mm0 \n\t"\
317 "movq %%mm0, " #dst " \n\t"\
318 "movq %%mm0, 8+" #dst " \n\t"\
319 "movq %%mm0, 16+" #dst " \n\t"\
320 "movq %%mm0, 24+" #dst " \n\t"\
325 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
330 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
331 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
332 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
336 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
337 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
338 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
339 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'. Three stages are visible:
 *
 *   1. Test:  the four source quadwords are loaded into mm0..mm3; mm0 is
 *             ANDed with the mask MANGLE(wm1010) and ORed with mm1, mm2 and
 *             mm3; the result is packed and moved to eax, and "orl eax,eax"
 *             sets the flags (ZF is set when the combined value is zero).
 *   2. Full:  the complete row multiply/accumulate sequence, storing
 *             quadwords at dst, 8+dst, 16+dst and 24+dst.
 *   3. Short: mm0 is shifted left by 16, MANGLE(d40000) is added, the sum is
 *             shifted right (arithmetic) by 13 and packed, and that single
 *             quadword is stored to all four dst slots.
 *
 *   rounder   instruction text applied to mm4 and mm0 in stage 2.
 *   shift     immediate count for the psrad instructions of stage 2.
 *
 * eax and mm0..mm7 are overwritten. Constants are read from asm operand %2.
 *
 * NOTE(review): no jump or label is visible between the stages in this view,
 * so which stage runs for which input cannot be determined from here;
 * presumably stage 1 selects between stages 2 and 3 - confirm against the
 * complete file.
 * NOTE(review): the contents of wm1010 and d40000 are not visible; presumably
 * a mask that excludes the DC word and a rounding constant - confirm.
 */
343 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
344 "movq " #src0 ", %%mm0 \n\t" \
345 "movq " #src4 ", %%mm1 \n\t" \
346 "movq " #src1 ", %%mm2 \n\t" \
347 "movq " #src5 ", %%mm3 \n\t" \
348 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
349 "pand %%mm0, %%mm4 \n\t"\
350 "por %%mm1, %%mm4 \n\t"\
351 "por %%mm2, %%mm4 \n\t"\
352 "por %%mm3, %%mm4 \n\t"\
353 "packssdw %%mm4,%%mm4 \n\t"\
354 "movd %%mm4, %%eax \n\t"\
355 "orl %%eax, %%eax \n\t" /* sets ZF when the combined value is zero */\
357 "movq 16(%2), %%mm4 \n\t" \
358 "pmaddwd %%mm0, %%mm4 \n\t" \
359 "movq 24(%2), %%mm5 \n\t" \
360 "pmaddwd %%mm5, %%mm0 \n\t" \
361 "movq 32(%2), %%mm5 \n\t" \
362 "pmaddwd %%mm1, %%mm5 \n\t" \
363 "movq 40(%2), %%mm6 \n\t" \
364 "pmaddwd %%mm6, %%mm1 \n\t" \
365 "movq 48(%2), %%mm7 \n\t" \
366 "pmaddwd %%mm2, %%mm7 \n\t" \
367 #rounder ", %%mm4 \n\t" /* rounder applied to mm4 */\
368 "movq %%mm4, %%mm6 \n\t" \
369 "paddd %%mm5, %%mm4 \n\t" \
370 "psubd %%mm5, %%mm6 \n\t" \
371 "movq 56(%2), %%mm5 \n\t" \
372 "pmaddwd %%mm3, %%mm5 \n\t" \
373 #rounder ", %%mm0 \n\t" /* rounder applied to mm0 */\
374 "paddd %%mm0, %%mm1 \n\t" \
375 "paddd %%mm0, %%mm0 \n\t" \
376 "psubd %%mm1, %%mm0 \n\t" /* mm0 = old mm0 - old mm1, computed as 2a-(a+b) */ \
377 "pmaddwd 64(%2), %%mm2 \n\t" \
378 "paddd %%mm5, %%mm7 \n\t" \
379 "movq 72(%2), %%mm5 \n\t" \
380 "pmaddwd %%mm3, %%mm5 \n\t" \
381 "paddd %%mm4, %%mm7 \n\t" \
382 "paddd %%mm4, %%mm4 \n\t" \
383 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, same trick */ \
384 "paddd %%mm2, %%mm5 \n\t" \
385 "psrad $" #shift ", %%mm7 \n\t"\
386 "psrad $" #shift ", %%mm4 \n\t"\
387 "movq %%mm1, %%mm2 \n\t" \
388 "paddd %%mm5, %%mm1 \n\t" \
389 "psubd %%mm5, %%mm2 \n\t" \
390 "psrad $" #shift ", %%mm1 \n\t"\
391 "psrad $" #shift ", %%mm2 \n\t"\
392 "packssdw %%mm1, %%mm7 \n\t" \
393 "packssdw %%mm4, %%mm2 \n\t" \
394 "movq %%mm7, " #dst " \n\t"\
395 "movq " #src1 ", %%mm1 \n\t" \
396 "movq 80(%2), %%mm4 \n\t" \
397 "movq %%mm2, 24+" #dst " \n\t"\
398 "pmaddwd %%mm1, %%mm4 \n\t" \
399 "movq 88(%2), %%mm7 \n\t" \
400 "pmaddwd 96(%2), %%mm1 \n\t" \
401 "pmaddwd %%mm3, %%mm7 \n\t" \
402 "movq %%mm0, %%mm2 \n\t" \
403 "pmaddwd 104(%2), %%mm3 \n\t" \
404 "paddd %%mm7, %%mm4 \n\t" \
405 "paddd %%mm4, %%mm2 \n\t" \
406 "psubd %%mm4, %%mm0 \n\t" \
407 "psrad $" #shift ", %%mm2 \n\t"\
408 "psrad $" #shift ", %%mm0 \n\t"\
409 "movq %%mm6, %%mm4 \n\t" \
410 "paddd %%mm1, %%mm3 \n\t" \
411 "paddd %%mm3, %%mm6 \n\t" \
412 "psubd %%mm3, %%mm4 \n\t" \
413 "psrad $" #shift ", %%mm6 \n\t"\
414 "packssdw %%mm6, %%mm2 \n\t" \
415 "movq %%mm2, 8+" #dst " \n\t"\
416 "psrad $" #shift ", %%mm4 \n\t"\
417 "packssdw %%mm0, %%mm4 \n\t" \
418 "movq %%mm4, 16+" #dst " \n\t"\
421 "pslld $16, %%mm0 \n\t"\
422 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
423 "psrad $13, %%mm0 \n\t"\
424 "packssdw %%mm0, %%mm0 \n\t"\
425 "movq %%mm0, " #dst " \n\t"\
426 "movq %%mm0, 8+" #dst " \n\t"\
427 "movq %%mm0, 16+" #dst " \n\t"\
428 "movq %%mm0, 24+" #dst " \n\t"\
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   1. Test: the four source quadwords are loaded into mm0..mm3 and ORed
 *            together without any mask; the result is packed and moved to
 *            eax, and "orl eax,eax" sets the flags (ZF is set when all four
 *            quadwords are zero).
 *   2. Full: the complete row multiply/accumulate sequence, storing
 *            quadwords at dst, 8+dst, 16+dst and 24+dst.
 *
 *   rounder   instruction text applied to mm4 and mm0 in stage 2.
 *   shift     immediate count for every psrad.
 *   bt        NOTE(review): not referenced anywhere in the visible body;
 *             callers pass local-label text such as 4f or 2f, so presumably
 *             it is a branch target consumed by a line not shown here -
 *             confirm against the complete file.
 *
 * eax and mm0..mm7 are overwritten. Constants are read from asm operand %2.
 */
431 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
432 "movq " #src0 ", %%mm0 \n\t" \
433 "movq " #src4 ", %%mm1 \n\t" \
434 "movq " #src1 ", %%mm2 \n\t" \
435 "movq " #src5 ", %%mm3 \n\t" \
436 "movq %%mm0, %%mm4 \n\t"\
437 "por %%mm1, %%mm4 \n\t"\
438 "por %%mm2, %%mm4 \n\t"\
439 "por %%mm3, %%mm4 \n\t"\
440 "packssdw %%mm4,%%mm4 \n\t"\
441 "movd %%mm4, %%eax \n\t"\
442 "orl %%eax, %%eax \n\t" /* sets ZF when all four source quadwords are zero */\
444 "movq 16(%2), %%mm4 \n\t" \
445 "pmaddwd %%mm0, %%mm4 \n\t" \
446 "movq 24(%2), %%mm5 \n\t" \
447 "pmaddwd %%mm5, %%mm0 \n\t" \
448 "movq 32(%2), %%mm5 \n\t" \
449 "pmaddwd %%mm1, %%mm5 \n\t" \
450 "movq 40(%2), %%mm6 \n\t" \
451 "pmaddwd %%mm6, %%mm1 \n\t" \
452 "movq 48(%2), %%mm7 \n\t" \
453 "pmaddwd %%mm2, %%mm7 \n\t" \
454 #rounder ", %%mm4 \n\t" /* rounder applied to mm4 */\
455 "movq %%mm4, %%mm6 \n\t" \
456 "paddd %%mm5, %%mm4 \n\t" \
457 "psubd %%mm5, %%mm6 \n\t" \
458 "movq 56(%2), %%mm5 \n\t" \
459 "pmaddwd %%mm3, %%mm5 \n\t" \
460 #rounder ", %%mm0 \n\t" /* rounder applied to mm0 */\
461 "paddd %%mm0, %%mm1 \n\t" \
462 "paddd %%mm0, %%mm0 \n\t" \
463 "psubd %%mm1, %%mm0 \n\t" /* mm0 = old mm0 - old mm1, computed as 2a-(a+b) */ \
464 "pmaddwd 64(%2), %%mm2 \n\t" \
465 "paddd %%mm5, %%mm7 \n\t" \
466 "movq 72(%2), %%mm5 \n\t" \
467 "pmaddwd %%mm3, %%mm5 \n\t" \
468 "paddd %%mm4, %%mm7 \n\t" \
469 "paddd %%mm4, %%mm4 \n\t" \
470 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, same trick */ \
471 "paddd %%mm2, %%mm5 \n\t" \
472 "psrad $" #shift ", %%mm7 \n\t"\
473 "psrad $" #shift ", %%mm4 \n\t"\
474 "movq %%mm1, %%mm2 \n\t" \
475 "paddd %%mm5, %%mm1 \n\t" \
476 "psubd %%mm5, %%mm2 \n\t" \
477 "psrad $" #shift ", %%mm1 \n\t"\
478 "psrad $" #shift ", %%mm2 \n\t"\
479 "packssdw %%mm1, %%mm7 \n\t" \
480 "packssdw %%mm4, %%mm2 \n\t" \
481 "movq %%mm7, " #dst " \n\t"\
482 "movq " #src1 ", %%mm1 \n\t" \
483 "movq 80(%2), %%mm4 \n\t" \
484 "movq %%mm2, 24+" #dst " \n\t"\
485 "pmaddwd %%mm1, %%mm4 \n\t" \
486 "movq 88(%2), %%mm7 \n\t" \
487 "pmaddwd 96(%2), %%mm1 \n\t" \
488 "pmaddwd %%mm3, %%mm7 \n\t" \
489 "movq %%mm0, %%mm2 \n\t" \
490 "pmaddwd 104(%2), %%mm3 \n\t" \
491 "paddd %%mm7, %%mm4 \n\t" \
492 "paddd %%mm4, %%mm2 \n\t" \
493 "psubd %%mm4, %%mm0 \n\t" \
494 "psrad $" #shift ", %%mm2 \n\t"\
495 "psrad $" #shift ", %%mm0 \n\t"\
496 "movq %%mm6, %%mm4 \n\t" \
497 "paddd %%mm1, %%mm3 \n\t" \
498 "paddd %%mm3, %%mm6 \n\t" \
499 "psubd %%mm3, %%mm4 \n\t" \
500 "psrad $" #shift ", %%mm6 \n\t"\
501 "packssdw %%mm6, %%mm2 \n\t" \
502 "movq %%mm2, 8+" #dst " \n\t"\
503 "psrad $" #shift ", %%mm4 \n\t"\
504 "packssdw %%mm0, %%mm4 \n\t" \
505 "movq %%mm4, 16+" #dst " \n\t"\
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Unconditional row computation: expands to adjacent string literals forming
 * MMX inline-assembly text, with every argument stringized via '#'.
 *
 *   src0, src4, src1, src5  memory operands; one quadword is loaded from each
 *                           into mm0, mm1, mm2 and mm3 respectively (src1 is
 *                           loaded a second time further down, into mm1).
 *   dst                     memory operand; four quadwords are stored, at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           applied once to mm4 and once to mm0.
 *   shift                   immediate count for every psrad.
 *
 * Multiplier quadwords are read at byte offsets 16..104 from asm operand %2.
 * Products are accumulated as dwords (pmaddwd/paddd/psubd), shifted right
 * arithmetically by `shift`, then narrowed to words with signed saturation.
 * All of mm0..mm7 are overwritten; no flags test is performed.
 */
507 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
508 "movq " #src0 ", %%mm0 \n\t" \
509 "movq " #src4 ", %%mm1 \n\t" \
510 "movq " #src1 ", %%mm2 \n\t" \
511 "movq " #src5 ", %%mm3 \n\t" \
512 "movq 16(%2), %%mm4 \n\t" \
513 "pmaddwd %%mm0, %%mm4 \n\t" \
514 "movq 24(%2), %%mm5 \n\t" \
515 "pmaddwd %%mm5, %%mm0 \n\t" \
516 "movq 32(%2), %%mm5 \n\t" \
517 "pmaddwd %%mm1, %%mm5 \n\t" \
518 "movq 40(%2), %%mm6 \n\t" \
519 "pmaddwd %%mm6, %%mm1 \n\t" \
520 "movq 48(%2), %%mm7 \n\t" \
521 "pmaddwd %%mm2, %%mm7 \n\t" \
522 #rounder ", %%mm4 \n\t" /* rounder applied to mm4 */\
523 "movq %%mm4, %%mm6 \n\t" \
524 "paddd %%mm5, %%mm4 \n\t" \
525 "psubd %%mm5, %%mm6 \n\t" \
526 "movq 56(%2), %%mm5 \n\t" \
527 "pmaddwd %%mm3, %%mm5 \n\t" \
528 #rounder ", %%mm0 \n\t" /* rounder applied to mm0 */\
529 "paddd %%mm0, %%mm1 \n\t" \
530 "paddd %%mm0, %%mm0 \n\t" \
531 "psubd %%mm1, %%mm0 \n\t" /* mm0 = old mm0 - old mm1, computed as 2a-(a+b) */ \
532 "pmaddwd 64(%2), %%mm2 \n\t" \
533 "paddd %%mm5, %%mm7 \n\t" \
534 "movq 72(%2), %%mm5 \n\t" \
535 "pmaddwd %%mm3, %%mm5 \n\t" \
536 "paddd %%mm4, %%mm7 \n\t" \
537 "paddd %%mm4, %%mm4 \n\t" \
538 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, same trick */ \
539 "paddd %%mm2, %%mm5 \n\t" \
540 "psrad $" #shift ", %%mm7 \n\t"\
541 "psrad $" #shift ", %%mm4 \n\t"\
542 "movq %%mm1, %%mm2 \n\t" \
543 "paddd %%mm5, %%mm1 \n\t" \
544 "psubd %%mm5, %%mm2 \n\t" \
545 "psrad $" #shift ", %%mm1 \n\t"\
546 "psrad $" #shift ", %%mm2 \n\t"\
547 "packssdw %%mm1, %%mm7 \n\t" /* narrow dwords to words with signed saturation */ \
548 "packssdw %%mm4, %%mm2 \n\t" \
549 "movq %%mm7, " #dst " \n\t"\
550 "movq " #src1 ", %%mm1 \n\t" /* src1 reloaded; mm2 was consumed above */ \
551 "movq 80(%2), %%mm4 \n\t" \
552 "movq %%mm2, 24+" #dst " \n\t"\
553 "pmaddwd %%mm1, %%mm4 \n\t" \
554 "movq 88(%2), %%mm7 \n\t" \
555 "pmaddwd 96(%2), %%mm1 \n\t" \
556 "pmaddwd %%mm3, %%mm7 \n\t" \
557 "movq %%mm0, %%mm2 \n\t" \
558 "pmaddwd 104(%2), %%mm3 \n\t" \
559 "paddd %%mm7, %%mm4 \n\t" \
560 "paddd %%mm4, %%mm2 \n\t" \
561 "psubd %%mm4, %%mm0 \n\t" \
562 "psrad $" #shift ", %%mm2 \n\t"\
563 "psrad $" #shift ", %%mm0 \n\t"\
564 "movq %%mm6, %%mm4 \n\t" \
565 "paddd %%mm1, %%mm3 \n\t" \
566 "paddd %%mm3, %%mm6 \n\t" \
567 "psubd %%mm3, %%mm4 \n\t" \
568 "psrad $" #shift ", %%mm6 \n\t"\
569 "packssdw %%mm6, %%mm2 \n\t" \
570 "movq %%mm2, 8+" #dst " \n\t"\
571 "psrad $" #shift ", %%mm4 \n\t"\
572 "packssdw %%mm0, %%mm4 \n\t" \
573 "movq %%mm4, 16+" #dst " \n\t"\
576 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
577 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
578 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
579 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant reading all four inputs.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src4, src1, src5  memory operands; one quadword is loaded from each
 *                           into mm0..mm3 (src1 is loaded again later, into
 *                           mm0).
 *   dst                     memory operand; eight 32-bit stores (movd) are
 *                           made, at dst + {0,16,32,48,64,80,96,112}.
 *   shift                   immediate count for every psrad.
 *
 * Multiplier quadwords are read at byte offsets 16..104 from asm operand %2.
 * Each result register is packed with itself (packssdw r, r) so that its low
 * 32 bits hold two saturated 16-bit values, which movd then stores.
 * All of mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably the general column pass, used when no input rows
 * are known to be zero - confirm against the dispatch logic in the full file.
 */
582 #define IDCT(src0, src4, src1, src5, dst, shift) \
583 "movq " #src0 ", %%mm0 \n\t" \
584 "movq " #src4 ", %%mm1 \n\t" \
585 "movq " #src1 ", %%mm2 \n\t" \
586 "movq " #src5 ", %%mm3 \n\t" \
587 "movq 16(%2), %%mm4 \n\t" \
588 "pmaddwd %%mm0, %%mm4 \n\t" \
589 "movq 24(%2), %%mm5 \n\t" \
590 "pmaddwd %%mm5, %%mm0 \n\t" \
591 "movq 32(%2), %%mm5 \n\t" \
592 "pmaddwd %%mm1, %%mm5 \n\t" \
593 "movq 40(%2), %%mm6 \n\t" \
594 "pmaddwd %%mm6, %%mm1 \n\t" \
595 "movq %%mm4, %%mm6 \n\t" \
596 "movq 48(%2), %%mm7 \n\t" \
597 "pmaddwd %%mm2, %%mm7 \n\t" \
598 "paddd %%mm5, %%mm4 \n\t" \
599 "psubd %%mm5, %%mm6 \n\t" \
600 "movq %%mm0, %%mm5 \n\t" \
601 "paddd %%mm1, %%mm0 \n\t" \
602 "psubd %%mm1, %%mm5 \n\t" \
603 "movq 56(%2), %%mm1 \n\t" \
604 "pmaddwd %%mm3, %%mm1 \n\t" \
605 "pmaddwd 64(%2), %%mm2 \n\t" \
606 "paddd %%mm1, %%mm7 \n\t" \
607 "movq 72(%2), %%mm1 \n\t" \
608 "pmaddwd %%mm3, %%mm1 \n\t" \
609 "paddd %%mm4, %%mm7 \n\t" \
610 "paddd %%mm4, %%mm4 \n\t" \
611 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, computed as 2a-(a+b) */ \
612 "paddd %%mm2, %%mm1 \n\t" \
613 "psrad $" #shift ", %%mm7 \n\t"\
614 "psrad $" #shift ", %%mm4 \n\t"\
615 "movq %%mm0, %%mm2 \n\t" \
616 "paddd %%mm1, %%mm0 \n\t" \
617 "psubd %%mm1, %%mm2 \n\t" \
618 "psrad $" #shift ", %%mm0 \n\t"\
619 "psrad $" #shift ", %%mm2 \n\t"\
620 "packssdw %%mm7, %%mm7 \n\t" /* narrow with signed saturation; both halves identical */ \
621 "movd %%mm7, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
622 "packssdw %%mm0, %%mm0 \n\t" \
623 "movd %%mm0, 16+" #dst " \n\t"\
624 "packssdw %%mm2, %%mm2 \n\t" \
625 "movd %%mm2, 96+" #dst " \n\t"\
626 "packssdw %%mm4, %%mm4 \n\t" \
627 "movd %%mm4, 112+" #dst " \n\t"\
628 "movq " #src1 ", %%mm0 \n\t" /* src1 reloaded; mm2 was consumed above */ \
629 "movq 80(%2), %%mm4 \n\t" \
630 "pmaddwd %%mm0, %%mm4 \n\t" \
631 "movq 88(%2), %%mm7 \n\t" \
632 "pmaddwd 96(%2), %%mm0 \n\t" \
633 "pmaddwd %%mm3, %%mm7 \n\t" \
634 "movq %%mm5, %%mm2 \n\t" \
635 "pmaddwd 104(%2), %%mm3 \n\t" \
636 "paddd %%mm7, %%mm4 \n\t" \
637 "paddd %%mm4, %%mm2 \n\t" \
638 "psubd %%mm4, %%mm5 \n\t" \
639 "psrad $" #shift ", %%mm2 \n\t"\
640 "psrad $" #shift ", %%mm5 \n\t"\
641 "movq %%mm6, %%mm4 \n\t" \
642 "paddd %%mm0, %%mm3 \n\t" \
643 "paddd %%mm3, %%mm6 \n\t" \
644 "psubd %%mm3, %%mm4 \n\t" \
645 "psrad $" #shift ", %%mm6 \n\t"\
646 "psrad $" #shift ", %%mm4 \n\t"\
647 "packssdw %%mm2, %%mm2 \n\t" \
648 "packssdw %%mm6, %%mm6 \n\t" \
649 "movd %%mm2, 32+" #dst " \n\t"\
650 "packssdw %%mm4, %%mm4 \n\t" \
651 "packssdw %%mm5, %%mm5 \n\t" \
652 "movd %%mm6, 48+" #dst " \n\t"\
653 "movd %%mm4, 64+" #dst " \n\t"\
654 "movd %%mm5, 80+" #dst " \n\t"
658 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
659 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
660 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
661 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
666 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
667 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant that never reads src1.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src4, src5  memory operands; one quadword each is loaded into mm0,
 *                     mm1 and mm3.
 *   src1              accepted for signature compatibility; not referenced
 *                     in the body.
 *   dst               memory operand; eight 32-bit stores (movd) are made,
 *                     at dst + {0,16,32,48,64,80,96,112}.
 *   shift             immediate count for every psrad.
 *
 * Only the multiplier quadwords at byte offsets 16, 24, 32, 40, 56, 72, 88
 * and 104 from asm operand %2 are used. mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src1 is all zero, so its products can be skipped - confirm against the
 * dispatch logic in the full file.
 */
670 #define IDCT(src0, src4, src1, src5, dst, shift) \
671 "movq " #src0 ", %%mm0 \n\t" \
672 "movq " #src4 ", %%mm1 \n\t" \
673 "movq " #src5 ", %%mm3 \n\t" \
674 "movq 16(%2), %%mm4 \n\t" \
675 "pmaddwd %%mm0, %%mm4 \n\t" \
676 "movq 24(%2), %%mm5 \n\t" \
677 "pmaddwd %%mm5, %%mm0 \n\t" \
678 "movq 32(%2), %%mm5 \n\t" \
679 "pmaddwd %%mm1, %%mm5 \n\t" \
680 "movq 40(%2), %%mm6 \n\t" \
681 "pmaddwd %%mm6, %%mm1 \n\t" \
682 "movq %%mm4, %%mm6 \n\t" \
683 "paddd %%mm5, %%mm4 \n\t" \
684 "psubd %%mm5, %%mm6 \n\t" \
685 "movq %%mm0, %%mm5 \n\t" \
686 "paddd %%mm1, %%mm0 \n\t" \
687 "psubd %%mm1, %%mm5 \n\t" \
688 "movq 56(%2), %%mm1 \n\t" \
689 "pmaddwd %%mm3, %%mm1 \n\t" \
690 "movq 72(%2), %%mm7 \n\t" \
691 "pmaddwd %%mm3, %%mm7 \n\t" \
692 "paddd %%mm4, %%mm1 \n\t" \
693 "paddd %%mm4, %%mm4 \n\t" \
694 "psubd %%mm1, %%mm4 \n\t" /* mm4 = old mm4 - old mm1, computed as 2a-(a+b) */ \
695 "psrad $" #shift ", %%mm1 \n\t"\
696 "psrad $" #shift ", %%mm4 \n\t"\
697 "movq %%mm0, %%mm2 \n\t" \
698 "paddd %%mm7, %%mm0 \n\t" \
699 "psubd %%mm7, %%mm2 \n\t" \
700 "psrad $" #shift ", %%mm0 \n\t"\
701 "psrad $" #shift ", %%mm2 \n\t"\
702 "packssdw %%mm1, %%mm1 \n\t" /* narrow with signed saturation; both halves identical */ \
703 "movd %%mm1, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
704 "packssdw %%mm0, %%mm0 \n\t" \
705 "movd %%mm0, 16+" #dst " \n\t"\
706 "packssdw %%mm2, %%mm2 \n\t" \
707 "movd %%mm2, 96+" #dst " \n\t"\
708 "packssdw %%mm4, %%mm4 \n\t" \
709 "movd %%mm4, 112+" #dst " \n\t"\
710 "movq 88(%2), %%mm1 \n\t" \
711 "pmaddwd %%mm3, %%mm1 \n\t" \
712 "movq %%mm5, %%mm2 \n\t" \
713 "pmaddwd 104(%2), %%mm3 \n\t" \
714 "paddd %%mm1, %%mm2 \n\t" \
715 "psubd %%mm1, %%mm5 \n\t" \
716 "psrad $" #shift ", %%mm2 \n\t"\
717 "psrad $" #shift ", %%mm5 \n\t"\
718 "movq %%mm6, %%mm1 \n\t" \
719 "paddd %%mm3, %%mm6 \n\t" \
720 "psubd %%mm3, %%mm1 \n\t" \
721 "psrad $" #shift ", %%mm6 \n\t"\
722 "psrad $" #shift ", %%mm1 \n\t"\
723 "packssdw %%mm2, %%mm2 \n\t" \
724 "packssdw %%mm6, %%mm6 \n\t" \
725 "movd %%mm2, 32+" #dst " \n\t"\
726 "packssdw %%mm1, %%mm1 \n\t" \
727 "packssdw %%mm5, %%mm5 \n\t" \
728 "movd %%mm6, 48+" #dst " \n\t"\
729 "movd %%mm1, 64+" #dst " \n\t"\
730 "movd %%mm5, 80+" #dst " \n\t"
733 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
734 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
735 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
736 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
741 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant reading only src0 and
 * src5.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src5  memory operands; one quadword each is loaded into mm0 and
 *               mm3.
 *   src4, src1  accepted for signature compatibility; not referenced in the
 *               body.
 *   dst         memory operand; eight 32-bit stores (movd) are made, at
 *               dst + {0,16,32,48,64,80,96,112}.
 *   shift       immediate count for every psrad.
 *
 * Only the multiplier quadwords at byte offsets 16, 24, 56, 72, 88 and 104
 * from asm operand %2 are used. mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src4 and src1 is all zero - confirm against the dispatch logic in the
 * full file.
 */
744 #define IDCT(src0, src4, src1, src5, dst, shift) \
745 "movq " #src0 ", %%mm0 \n\t" \
746 "movq " #src5 ", %%mm3 \n\t" \
747 "movq 16(%2), %%mm4 \n\t" \
748 "pmaddwd %%mm0, %%mm4 \n\t" \
749 "movq 24(%2), %%mm5 \n\t" \
750 "pmaddwd %%mm5, %%mm0 \n\t" \
751 "movq %%mm4, %%mm6 \n\t" \
752 "movq %%mm0, %%mm5 \n\t" \
753 "movq 56(%2), %%mm1 \n\t" \
754 "pmaddwd %%mm3, %%mm1 \n\t" \
755 "movq 72(%2), %%mm7 \n\t" \
756 "pmaddwd %%mm3, %%mm7 \n\t" \
757 "paddd %%mm4, %%mm1 \n\t" \
758 "paddd %%mm4, %%mm4 \n\t" \
759 "psubd %%mm1, %%mm4 \n\t" /* mm4 = old mm4 - old mm1, computed as 2a-(a+b) */ \
760 "psrad $" #shift ", %%mm1 \n\t"\
761 "psrad $" #shift ", %%mm4 \n\t"\
762 "movq %%mm0, %%mm2 \n\t" \
763 "paddd %%mm7, %%mm0 \n\t" \
764 "psubd %%mm7, %%mm2 \n\t" \
765 "psrad $" #shift ", %%mm0 \n\t"\
766 "psrad $" #shift ", %%mm2 \n\t"\
767 "packssdw %%mm1, %%mm1 \n\t" /* narrow with signed saturation; both halves identical */ \
768 "movd %%mm1, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
769 "packssdw %%mm0, %%mm0 \n\t" \
770 "movd %%mm0, 16+" #dst " \n\t"\
771 "packssdw %%mm2, %%mm2 \n\t" \
772 "movd %%mm2, 96+" #dst " \n\t"\
773 "packssdw %%mm4, %%mm4 \n\t" \
774 "movd %%mm4, 112+" #dst " \n\t"\
775 "movq 88(%2), %%mm1 \n\t" \
776 "pmaddwd %%mm3, %%mm1 \n\t" \
777 "movq %%mm5, %%mm2 \n\t" \
778 "pmaddwd 104(%2), %%mm3 \n\t" \
779 "paddd %%mm1, %%mm2 \n\t" \
780 "psubd %%mm1, %%mm5 \n\t" \
781 "psrad $" #shift ", %%mm2 \n\t"\
782 "psrad $" #shift ", %%mm5 \n\t"\
783 "movq %%mm6, %%mm1 \n\t" \
784 "paddd %%mm3, %%mm6 \n\t" \
785 "psubd %%mm3, %%mm1 \n\t" \
786 "psrad $" #shift ", %%mm6 \n\t"\
787 "psrad $" #shift ", %%mm1 \n\t"\
788 "packssdw %%mm2, %%mm2 \n\t" \
789 "packssdw %%mm6, %%mm6 \n\t" \
790 "movd %%mm2, 32+" #dst " \n\t"\
791 "packssdw %%mm1, %%mm1 \n\t" \
792 "packssdw %%mm5, %%mm5 \n\t" \
793 "movd %%mm6, 48+" #dst " \n\t"\
794 "movd %%mm1, 64+" #dst " \n\t"\
795 "movd %%mm5, 80+" #dst " \n\t"
799 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
800 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
801 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
802 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
807 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant that never reads src4.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src1, src5  memory operands; one quadword each is loaded into mm0,
 *                     mm2 and mm3 (src1 is loaded again later, into mm0).
 *   src4              accepted for signature compatibility; not referenced
 *                     in the body.
 *   dst               memory operand; eight 32-bit stores (movd) are made,
 *                     at dst + {0,16,32,48,64,80,96,112}.
 *   shift             immediate count for every psrad.
 *
 * The multiplier quadwords at byte offsets 32 and 40 from asm operand %2 are
 * skipped; offsets 16, 24 and 48..104 are used. mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src4 is all zero - confirm against the dispatch logic in the full file.
 */
810 #define IDCT(src0, src4, src1, src5, dst, shift) \
811 "movq " #src0 ", %%mm0 \n\t" \
812 "movq " #src1 ", %%mm2 \n\t" \
813 "movq " #src5 ", %%mm3 \n\t" \
814 "movq 16(%2), %%mm4 \n\t" \
815 "pmaddwd %%mm0, %%mm4 \n\t" \
816 "movq 24(%2), %%mm5 \n\t" \
817 "pmaddwd %%mm5, %%mm0 \n\t" \
818 "movq %%mm4, %%mm6 \n\t" \
819 "movq 48(%2), %%mm7 \n\t" \
820 "pmaddwd %%mm2, %%mm7 \n\t" \
821 "movq %%mm0, %%mm5 \n\t" \
822 "movq 56(%2), %%mm1 \n\t" \
823 "pmaddwd %%mm3, %%mm1 \n\t" \
824 "pmaddwd 64(%2), %%mm2 \n\t" \
825 "paddd %%mm1, %%mm7 \n\t" \
826 "movq 72(%2), %%mm1 \n\t" \
827 "pmaddwd %%mm3, %%mm1 \n\t" \
828 "paddd %%mm4, %%mm7 \n\t" \
829 "paddd %%mm4, %%mm4 \n\t" \
830 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, computed as 2a-(a+b) */ \
831 "paddd %%mm2, %%mm1 \n\t" \
832 "psrad $" #shift ", %%mm7 \n\t"\
833 "psrad $" #shift ", %%mm4 \n\t"\
834 "movq %%mm0, %%mm2 \n\t" \
835 "paddd %%mm1, %%mm0 \n\t" \
836 "psubd %%mm1, %%mm2 \n\t" \
837 "psrad $" #shift ", %%mm0 \n\t"\
838 "psrad $" #shift ", %%mm2 \n\t"\
839 "packssdw %%mm7, %%mm7 \n\t" /* narrow with signed saturation; both halves identical */ \
840 "movd %%mm7, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
841 "packssdw %%mm0, %%mm0 \n\t" \
842 "movd %%mm0, 16+" #dst " \n\t"\
843 "packssdw %%mm2, %%mm2 \n\t" \
844 "movd %%mm2, 96+" #dst " \n\t"\
845 "packssdw %%mm4, %%mm4 \n\t" \
846 "movd %%mm4, 112+" #dst " \n\t"\
847 "movq " #src1 ", %%mm0 \n\t" /* src1 reloaded; mm2 was consumed above */ \
848 "movq 80(%2), %%mm4 \n\t" \
849 "pmaddwd %%mm0, %%mm4 \n\t" \
850 "movq 88(%2), %%mm7 \n\t" \
851 "pmaddwd 96(%2), %%mm0 \n\t" \
852 "pmaddwd %%mm3, %%mm7 \n\t" \
853 "movq %%mm5, %%mm2 \n\t" \
854 "pmaddwd 104(%2), %%mm3 \n\t" \
855 "paddd %%mm7, %%mm4 \n\t" \
856 "paddd %%mm4, %%mm2 \n\t" \
857 "psubd %%mm4, %%mm5 \n\t" \
858 "psrad $" #shift ", %%mm2 \n\t"\
859 "psrad $" #shift ", %%mm5 \n\t"\
860 "movq %%mm6, %%mm4 \n\t" \
861 "paddd %%mm0, %%mm3 \n\t" \
862 "paddd %%mm3, %%mm6 \n\t" \
863 "psubd %%mm3, %%mm4 \n\t" \
864 "psrad $" #shift ", %%mm6 \n\t"\
865 "psrad $" #shift ", %%mm4 \n\t"\
866 "packssdw %%mm2, %%mm2 \n\t" \
867 "packssdw %%mm6, %%mm6 \n\t" \
868 "movd %%mm2, 32+" #dst " \n\t"\
869 "packssdw %%mm4, %%mm4 \n\t" \
870 "packssdw %%mm5, %%mm5 \n\t" \
871 "movd %%mm6, 48+" #dst " \n\t"\
872 "movd %%mm4, 64+" #dst " \n\t"\
873 "movd %%mm5, 80+" #dst " \n\t"
876 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
877 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
878 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
879 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant reading only src0 and
 * src1.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src1  memory operands; one quadword each is loaded into mm0 and
 *               mm2.
 *   src4, src5  accepted for signature compatibility; not referenced in the
 *               body.
 *   dst         memory operand; eight 32-bit stores (movd) are made, at
 *               dst + {0,16,32,48,64,80,96,112}.
 *   shift       immediate count for every psrad.
 *
 * Only the multiplier quadwords at byte offsets 16, 24, 48, 64, 80 and 96
 * from asm operand %2 are used. mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src4 and src5 is all zero - confirm against the dispatch logic in the
 * full file.
 */
885 #define IDCT(src0, src4, src1, src5, dst, shift) \
886 "movq " #src0 ", %%mm0 \n\t" \
887 "movq " #src1 ", %%mm2 \n\t" \
888 "movq 16(%2), %%mm4 \n\t" \
889 "pmaddwd %%mm0, %%mm4 \n\t" \
890 "movq 24(%2), %%mm5 \n\t" \
891 "pmaddwd %%mm5, %%mm0 \n\t" \
892 "movq %%mm4, %%mm6 \n\t" \
893 "movq 48(%2), %%mm7 \n\t" \
894 "pmaddwd %%mm2, %%mm7 \n\t" \
895 "movq %%mm0, %%mm5 \n\t" \
896 "movq 64(%2), %%mm3 \n\t"\
897 "pmaddwd %%mm2, %%mm3 \n\t" \
898 "paddd %%mm4, %%mm7 \n\t" \
899 "paddd %%mm4, %%mm4 \n\t" \
900 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, computed as 2a-(a+b) */ \
901 "psrad $" #shift ", %%mm7 \n\t"\
902 "psrad $" #shift ", %%mm4 \n\t"\
903 "movq %%mm0, %%mm1 \n\t" \
904 "paddd %%mm3, %%mm0 \n\t" \
905 "psubd %%mm3, %%mm1 \n\t" \
906 "psrad $" #shift ", %%mm0 \n\t"\
907 "psrad $" #shift ", %%mm1 \n\t"\
908 "packssdw %%mm7, %%mm7 \n\t" /* narrow with signed saturation; both halves identical */ \
909 "movd %%mm7, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
910 "packssdw %%mm0, %%mm0 \n\t" \
911 "movd %%mm0, 16+" #dst " \n\t"\
912 "packssdw %%mm1, %%mm1 \n\t" \
913 "movd %%mm1, 96+" #dst " \n\t"\
914 "packssdw %%mm4, %%mm4 \n\t" \
915 "movd %%mm4, 112+" #dst " \n\t"\
916 "movq 80(%2), %%mm4 \n\t" \
917 "pmaddwd %%mm2, %%mm4 \n\t" \
918 "pmaddwd 96(%2), %%mm2 \n\t" \
919 "movq %%mm5, %%mm1 \n\t" \
920 "paddd %%mm4, %%mm1 \n\t" \
921 "psubd %%mm4, %%mm5 \n\t" \
922 "psrad $" #shift ", %%mm1 \n\t"\
923 "psrad $" #shift ", %%mm5 \n\t"\
924 "movq %%mm6, %%mm4 \n\t" \
925 "paddd %%mm2, %%mm6 \n\t" \
926 "psubd %%mm2, %%mm4 \n\t" \
927 "psrad $" #shift ", %%mm6 \n\t"\
928 "psrad $" #shift ", %%mm4 \n\t"\
929 "packssdw %%mm1, %%mm1 \n\t" \
930 "packssdw %%mm6, %%mm6 \n\t" \
931 "movd %%mm1, 32+" #dst " \n\t"\
932 "packssdw %%mm4, %%mm4 \n\t" \
933 "packssdw %%mm5, %%mm5 \n\t" \
934 "movd %%mm6, 48+" #dst " \n\t"\
935 "movd %%mm4, 64+" #dst " \n\t"\
936 "movd %%mm5, 80+" #dst " \n\t"
940 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
941 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
942 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
943 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant reading only src0 and
 * src4, handling two adjacent quadwords per expansion.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src4  memory operands; quadwords are loaded from src0, src4,
 *               8+src0 and 8+src4 (into mm0, mm1, mm2, mm3).
 *   src1, src5  accepted for signature compatibility; not referenced in the
 *               body.
 *   dst         memory operand; eight full quadword stores (movq) are made,
 *               at dst + {0,16,32,48,64,80,96,112}.
 *   shift       immediate count for every psrad.
 *
 * Only the multiplier quadwords at byte offsets 16, 24, 32 and 40 from asm
 * operand %2 are used. Only four distinct result quadwords are produced;
 * each is written twice, to mirrored slots:
 *   dst    and 112+dst,   16+dst and 96+dst,
 *   32+dst and  80+dst,   48+dst and 64+dst.
 * mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src1 and src5 is all zero, which makes the output rows pairwise equal -
 * confirm against the dispatch logic in the full file.
 */
949 #define IDCT(src0, src4, src1, src5, dst, shift) \
950 "movq " #src0 ", %%mm0 \n\t" \
951 "movq " #src4 ", %%mm1 \n\t" \
952 "movq 16(%2), %%mm4 \n\t" \
953 "pmaddwd %%mm0, %%mm4 \n\t" \
954 "movq 24(%2), %%mm5 \n\t" \
955 "pmaddwd %%mm5, %%mm0 \n\t" \
956 "movq 32(%2), %%mm5 \n\t" \
957 "pmaddwd %%mm1, %%mm5 \n\t" \
958 "movq 40(%2), %%mm6 \n\t" \
959 "pmaddwd %%mm6, %%mm1 \n\t" \
960 "movq %%mm4, %%mm6 \n\t" \
961 "paddd %%mm5, %%mm4 \n\t" \
962 "psubd %%mm5, %%mm6 \n\t" \
963 "movq %%mm0, %%mm5 \n\t" \
964 "paddd %%mm1, %%mm0 \n\t" \
965 "psubd %%mm1, %%mm5 \n\t" \
966 "movq 8+" #src0 ", %%mm2 \n\t" /* second quadword of the same inputs */ \
967 "movq 8+" #src4 ", %%mm3 \n\t" \
968 "movq 16(%2), %%mm1 \n\t" \
969 "pmaddwd %%mm2, %%mm1 \n\t" \
970 "movq 24(%2), %%mm7 \n\t" \
971 "pmaddwd %%mm7, %%mm2 \n\t" \
972 "movq 32(%2), %%mm7 \n\t" \
973 "pmaddwd %%mm3, %%mm7 \n\t" \
974 "pmaddwd 40(%2), %%mm3 \n\t" \
975 "paddd %%mm1, %%mm7 \n\t" \
976 "paddd %%mm1, %%mm1 \n\t" \
977 "psubd %%mm7, %%mm1 \n\t" /* mm1 = old mm1 - old mm7, computed as 2a-(a+b) */ \
978 "paddd %%mm2, %%mm3 \n\t" \
979 "paddd %%mm2, %%mm2 \n\t" \
980 "psubd %%mm3, %%mm2 \n\t" /* mm2 = old mm2 - old mm3, same trick */ \
981 "psrad $" #shift ", %%mm4 \n\t"\
982 "psrad $" #shift ", %%mm7 \n\t"\
983 "psrad $" #shift ", %%mm3 \n\t"\
984 "packssdw %%mm7, %%mm4 \n\t" /* combine both halves into four saturated words */ \
985 "movq %%mm4, " #dst " \n\t"\
986 "psrad $" #shift ", %%mm0 \n\t"\
987 "packssdw %%mm3, %%mm0 \n\t" \
988 "movq %%mm0, 16+" #dst " \n\t"\
989 "movq %%mm0, 96+" #dst " \n\t"\
990 "movq %%mm4, 112+" #dst " \n\t"\
991 "psrad $" #shift ", %%mm5 \n\t"\
992 "psrad $" #shift ", %%mm6 \n\t"\
993 "psrad $" #shift ", %%mm2 \n\t"\
994 "packssdw %%mm2, %%mm5 \n\t" \
995 "movq %%mm5, 32+" #dst " \n\t"\
996 "psrad $" #shift ", %%mm1 \n\t"\
997 "packssdw %%mm1, %%mm6 \n\t" \
998 "movq %%mm6, 48+" #dst " \n\t"\
999 "movq %%mm6, 64+" #dst " \n\t"\
1000 "movq %%mm5, 80+" #dst " \n\t"
1004 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1006 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1011 "
# .p2align 4 \n\t"\
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant that never reads src5.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0, src4, src1  memory operands; one quadword each is loaded into mm0,
 *                     mm1 and mm2.
 *   src5              accepted for signature compatibility; not referenced
 *                     in the body.
 *   dst               memory operand; eight 32-bit stores (movd) are made,
 *                     at dst + {0,16,32,48,64,80,96,112}.
 *   shift             immediate count for every psrad.
 *
 * Only the multiplier quadwords at byte offsets 16, 24, 32, 40, 48, 64, 80
 * and 96 from asm operand %2 are used. mm0..mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where the data behind
 * src5 is all zero - confirm against the dispatch logic in the full file.
 */
1014 #define IDCT(src0, src4, src1, src5, dst, shift) \
1015 "movq " #src0 ", %%mm0 \n\t" \
1016 "movq " #src4 ", %%mm1 \n\t" \
1017 "movq " #src1 ", %%mm2 \n\t" \
1018 "movq 16(%2), %%mm4 \n\t" \
1019 "pmaddwd %%mm0, %%mm4 \n\t" \
1020 "movq 24(%2), %%mm5 \n\t" \
1021 "pmaddwd %%mm5, %%mm0 \n\t" \
1022 "movq 32(%2), %%mm5 \n\t" \
1023 "pmaddwd %%mm1, %%mm5 \n\t" \
1024 "movq 40(%2), %%mm6 \n\t" \
1025 "pmaddwd %%mm6, %%mm1 \n\t" \
1026 "movq %%mm4, %%mm6 \n\t" \
1027 "movq 48(%2), %%mm7 \n\t" \
1028 "pmaddwd %%mm2, %%mm7 \n\t" \
1029 "paddd %%mm5, %%mm4 \n\t" \
1030 "psubd %%mm5, %%mm6 \n\t" \
1031 "movq %%mm0, %%mm5 \n\t" \
1032 "paddd %%mm1, %%mm0 \n\t" \
1033 "psubd %%mm1, %%mm5 \n\t" \
1034 "movq 64(%2), %%mm1 \n\t"\
1035 "pmaddwd %%mm2, %%mm1 \n\t" \
1036 "paddd %%mm4, %%mm7 \n\t" \
1037 "paddd %%mm4, %%mm4 \n\t" \
1038 "psubd %%mm7, %%mm4 \n\t" /* mm4 = old mm4 - old mm7, computed as 2a-(a+b) */ \
1039 "psrad $" #shift ", %%mm7 \n\t"\
1040 "psrad $" #shift ", %%mm4 \n\t"\
1041 "movq %%mm0, %%mm3 \n\t" \
1042 "paddd %%mm1, %%mm0 \n\t" \
1043 "psubd %%mm1, %%mm3 \n\t" \
1044 "psrad $" #shift ", %%mm0 \n\t"\
1045 "psrad $" #shift ", %%mm3 \n\t"\
1046 "packssdw %%mm7, %%mm7 \n\t" /* narrow with signed saturation; both halves identical */ \
1047 "movd %%mm7, " #dst " \n\t" /* low 32 bits = two 16-bit results */\
1048 "packssdw %%mm0, %%mm0 \n\t" \
1049 "movd %%mm0, 16+" #dst " \n\t"\
1050 "packssdw %%mm3, %%mm3 \n\t" \
1051 "movd %%mm3, 96+" #dst " \n\t"\
1052 "packssdw %%mm4, %%mm4 \n\t" \
1053 "movd %%mm4, 112+" #dst " \n\t"\
1054 "movq 80(%2), %%mm4 \n\t" \
1055 "pmaddwd %%mm2, %%mm4 \n\t" \
1056 "pmaddwd 96(%2), %%mm2 \n\t" \
1057 "movq %%mm5, %%mm3 \n\t" \
1058 "paddd %%mm4, %%mm3 \n\t" \
1059 "psubd %%mm4, %%mm5 \n\t" \
1060 "psrad $" #shift ", %%mm3 \n\t"\
1061 "psrad $" #shift ", %%mm5 \n\t"\
1062 "movq %%mm6, %%mm4 \n\t" \
1063 "paddd %%mm2, %%mm6 \n\t" \
1064 "psubd %%mm2, %%mm4 \n\t" \
1065 "psrad $" #shift ", %%mm6 \n\t"\
1066 "packssdw %%mm3, %%mm3 \n\t" \
1067 "movd %%mm3, 32+" #dst " \n\t"\
1068 "psrad $" #shift ", %%mm4 \n\t"\
1069 "packssdw %%mm6, %%mm6 \n\t" \
1070 "movd %%mm6, 48+" #dst " \n\t"\
1071 "packssdw %%mm4, %%mm4 \n\t" \
1072 "packssdw %%mm5, %%mm5 \n\t" \
1073 "movd %%mm4, 64+" #dst " \n\t"\
1074 "movd %%mm5, 80+" #dst " \n\t"
1078 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1079 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1080 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1081 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT(src0, src4, src1, src5, dst, shift) - variant reading only src0,
 * handling two adjacent quadwords per expansion.
 *
 * Expands to adjacent string literals forming MMX inline-assembly text; the
 * arguments are stringized with '#'.
 *
 *   src0              memory operand; quadwords are loaded from src0 and
 *                     8+src0 (into mm0 and mm2).
 *   src4, src1, src5  accepted for signature compatibility; not referenced
 *                     in the body.
 *   dst               memory operand; eight full quadword stores (movq) are
 *                     made, at dst + {0,16,32,48,64,80,96,112}.
 *   shift             immediate count for every psrad.
 *
 * Only two distinct result quadwords are produced:
 *   mm4 (products with 16(%2)) -> dst, 48+dst, 64+dst, 112+dst
 *   mm0 (products with 24(%2)) -> 16+dst, 32+dst, 80+dst, 96+dst
 * mm0, mm1, mm2, mm4, mm5 and mm7 are overwritten.
 *
 * NOTE(review): presumably a shortcut for the case where everything except
 * the data behind src0 is zero - confirm against the dispatch logic in the
 * full file.
 */
1088 #define IDCT(src0, src4, src1, src5, dst, shift) \
1089 "movq " #src0 ", %%mm0 \n\t" \
1090 "movq 16(%2), %%mm4 \n\t" \
1091 "pmaddwd %%mm0, %%mm4 \n\t" \
1092 "movq 24(%2), %%mm5 \n\t" \
1093 "pmaddwd %%mm5, %%mm0 \n\t" \
1094 "psrad $" #shift ", %%mm4 \n\t"\
1095 "psrad $" #shift ", %%mm0 \n\t"\
1096 "movq 8+" #src0 ", %%mm2 \n\t" /* second quadword of the same input */ \
1097 "movq 16(%2), %%mm1 \n\t" \
1098 "pmaddwd %%mm2, %%mm1 \n\t" \
1099 "movq 24(%2), %%mm7 \n\t" \
1100 "pmaddwd %%mm7, %%mm2 \n\t" \
1101 "movq 32(%2), %%mm7 \n\t" /* NOTE(review): mm7 is not read again within this macro; this load looks unused */ \
1102 "psrad $" #shift ", %%mm1 \n\t"\
1103 "packssdw %%mm1, %%mm4 \n\t" /* combine both halves into four saturated words */ \
1104 "movq %%mm4, " #dst " \n\t"\
1105 "psrad $" #shift ", %%mm2 \n\t"\
1106 "packssdw %%mm2, %%mm0 \n\t" \
1107 "movq %%mm0, 16+" #dst " \n\t"\
1108 "movq %%mm0, 96+" #dst " \n\t"\
1109 "movq %%mm4, 112+" #dst " \n\t"\
1110 "movq %%mm0, 32+" #dst " \n\t"\
1111 "movq %%mm4, 48+" #dst " \n\t"\
1112 "movq %%mm4, 64+" #dst " \n\t"\
1113 "movq %%mm0, 80+" #dst " \n\t"
1116 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1118 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1147 ::
"r" (
block),
"r" (temp),
"r" (coeffs)
#define DECLARE_ALIGNED(n, t, v)
#define DECLARE_ASM_CONST(n, t, v)
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
common internal API header
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
void ff_simple_idct_mmx(int16_t *block)
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)