SOURCES: ffmpeg-gcc4.patch (NEW) - use mmx intrinsics. compiler...

Mon Jan 30 03:12:39 CET 2006

Author: pluto                        Date: Mon Jan 30 02:12:39 2006 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- use mmx intrinsics.
  the compiler knows better than the hardcoded assembly how to
  optimally allocate registers and schedule instructions.

---- Files affected:
SOURCES:
   ffmpeg-gcc4.patch (1.2 -> 1.3)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/ffmpeg-gcc4.patch
diff -u /dev/null SOURCES/ffmpeg-gcc4.patch:1.3

--- /dev/null	Mon Jan 30 03:12:39 2006
+++ SOURCES/ffmpeg-gcc4.patch	Mon Jan 30 03:12:34 2006
@@ -0,0 +1,42 @@
+--- ffmpeg/libavcodec/i386/dsputil_mmx.c.orig	2006-01-12 22:43:17.000000000 +0000
++++ ffmpeg/libavcodec/i386/dsputil_mmx.c	2006-01-30 01:42:21.087254880 +0000
+@@ -20,6 +20,7 @@
+  * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+  */
+ 
++#include <mmintrin.h>
+ #include "../dsputil.h"
+ #include "../simple_idct.h"
+ #include "../mpegvideo.h"
+@@ -617,7 +618,23 @@
+     );
+ }
+ 
++#if (__GNUC__ >= 4)
+ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
++    __m64 row0 = _mm_cvtsi32_si64(*(unsigned*)(src + (0 * src_stride)));
++    __m64 row1 = _mm_cvtsi32_si64(*(unsigned*)(src + (1 * src_stride)));
++    __m64 row2 = _mm_cvtsi32_si64(*(unsigned*)(src + (2 * src_stride)));
++    __m64 row3 = _mm_cvtsi32_si64(*(unsigned*)(src + (3 * src_stride)));
++    __m64 tmp0 = _mm_unpacklo_pi8(row0, row1);
++    __m64 tmp1 = _mm_unpacklo_pi8(row2, row3);
++    __m64 row01 = _mm_unpacklo_pi16(tmp0, tmp1);
++    __m64 row23 = _mm_unpackhi_pi16(tmp0, tmp1);
++    *((unsigned*)(dst + (0 * dst_stride))) = _mm_cvtsi64_si32(row01);
++    *((unsigned*)(dst + (1 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row01, row01));
++    *((unsigned*)(dst + (2 * dst_stride))) = _mm_cvtsi64_si32(row23);
++    *((unsigned*)(dst + (3 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row23, row23));
++}
++#else
++static void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+         "movd  %4, %%mm0                \n\t"
+         "movd  %5, %%mm1                \n\t"
+@@ -645,6 +662,7 @@
+            "m" (*(uint32_t*)(src + 3*src_stride))
+     );
+ }
++#endif
+ 
+ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+     const int strength= ff_h263_loop_filter_strength[qscale];
================================================================