[packages/xvid] remove textrel patch dropped from spec in 9731c6c

atler atler at pld-linux.org
Wed Nov 26 00:09:51 CET 2025


commit 78e2c6266c54c7c18ad4c62bbf9a0831dbf7e49f
Author: Jan Palus <atler at pld-linux.org>
Date:   Wed Nov 26 00:01:17 2025 +0100

    remove textrel patch dropped from spec in 9731c6c

 xvid-1.1.2-textrel.patch | 5757 ----------------------------------------------
 1 file changed, 5757 deletions(-)
---
diff --git a/xvid-1.1.2-textrel.patch b/xvid-1.1.2-textrel.patch
deleted file mode 100644
index 2d3d773..0000000
--- a/xvid-1.1.2-textrel.patch
+++ /dev/null
@@ -1,5757 +0,0 @@
-diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm
---- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -50,23 +50,6 @@ BITS 32
- %endmacro
- 
- ;=============================================================================
--; Local data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--
--mult_mask:
--  db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00
--ignore_dc:
--  dw 0, -1, -1, -1
--
--;=============================================================================
- ; Code
- ;=============================================================================
- 
-@@ -91,7 +74,12 @@ ALIGN 16
- calc_cbp_mmx:
-   mov eax, [esp + 4]            ; coeff
- 
--  movq mm7, [ignore_dc]
-+  push byte 0                 ; align esp to 8 bytes
-+  push byte -1
-+  push dword 0xFFFF0000
-+  movq mm7, [esp]
-+  add esp, byte 8
-+
-   pxor mm6, mm6                ; used only for comparing
-   movq mm0, [eax+128*0]
-   movq mm1, [eax+128*1]
-@@ -123,7 +111,11 @@ calc_cbp_mmx:
-   MAKE_LOAD 13
-   MAKE_LOAD 14
- 
--  movq mm7, [mult_mask]
-+  push dword 0x00000201
-+  push dword 0x08042010
-+  movq mm7, [esp]
-+  add esp, byte 12
-+
-   packssdw mm0, mm1
-   packssdw mm2, mm3
-   packssdw mm4, mm5
-diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm
---- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -69,20 +69,6 @@ BITS 32
- %endmacro
- 
- ;=============================================================================
--; Data (Read Only)
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--ignore_dc:
--  dw 0, -1, -1, -1, -1, -1, -1, -1
--
--;=============================================================================
- ; Code
- ;=============================================================================
- 
-@@ -98,7 +84,13 @@ calc_cbp_sse2:
-   mov edx, [esp+4]         ; coeff[]
-   xor eax, eax             ; cbp = 0
- 
--  movdqu xmm7, [ignore_dc] ; mask to ignore dc value
-+  sub esp,byte 12          ; align esp to 16 bytes
-+  push byte -1
-+  push byte -1
-+  push byte -1
-+  push dword 0xFFFF0000
-+  movdqu xmm7, [esp]       ; mask to ignore dc value
-+  add esp, byte 28
-   pxor xmm6, xmm6          ; zero
- 
-   LOOP_SSE2 0
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -204,7 +204,7 @@ fdct_r_row:
-   psllw mm4, SHIFT_FRW_COL
-   movq mm6, mm0
-   psubsw mm2, mm1
--  movq mm1, [fdct_tg_all_16 + 4*2]
-+  movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
-   psubsw mm0, mm4
-   movq mm7, [%2 + %3*2 + 3*16]
-   pmulhw mm1, mm0
-@@ -216,9 +216,9 @@ fdct_r_row:
-   psubsw mm5, mm7
-   paddsw mm1, mm5
-   paddsw mm4, mm7
--  por mm1, [fdct_one_corr]
-+  por mm1, [ebx + fdct_one_corr wrt ..gotoff]
-   psllw mm2, SHIFT_FRW_COL + 1
--  pmulhw mm5, [fdct_tg_all_16 + 4*2]
-+  pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
-   movq mm7, mm4
-   psubsw mm3, [%2 + %3*2 + 5*16]
-   psubsw mm4, mm6
-@@ -230,34 +230,34 @@ fdct_r_row:
-   movq mm6, mm2
-   movq [%1 + %3*2 + 4*16], mm4
-   paddsw mm2, mm3
--  pmulhw mm2, [ocos_4_16]
-+  pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff]
-   psubsw mm6, mm3
--  pmulhw mm6, [ocos_4_16]
-+  pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff]
-   psubsw mm5, mm0
--  por mm5, [fdct_one_corr]
-+  por mm5, [ebx + fdct_one_corr wrt ..gotoff]
-   psllw mm1, SHIFT_FRW_COL
--  por mm2, [fdct_one_corr]
-+  por mm2, [ebx + fdct_one_corr wrt ..gotoff]
-   movq mm4, mm1
-   movq mm3, [%2 + %3*2 + 0*16]
-   paddsw mm1, mm6
-   psubsw mm3, [%2 + %3*2 + 7*16]
-   psubsw mm4, mm6
--  movq mm0, [fdct_tg_all_16 + 0*2]
-+  movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
-   psllw mm3, SHIFT_FRW_COL
--  movq mm6, [fdct_tg_all_16 + 8*2]
-+  movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
-   pmulhw mm0, mm1
-   movq [%1 + %3*2 + 0*16], mm7
-   pmulhw mm6, mm4
-   movq [%1 + %3*2 + 6*16], mm5
-   movq mm7, mm3
--  movq mm5, [fdct_tg_all_16 + 8*2]
-+  movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
-   psubsw mm7, mm2
-   paddsw mm3, mm2
-   pmulhw mm5, mm7
-   paddsw mm0, mm3
-   paddsw mm6, mm4
--  pmulhw mm3, [fdct_tg_all_16 + 0*2]
--  por mm0, [fdct_one_corr]
-+  pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
-+  por mm0, [ebx + fdct_one_corr wrt ..gotoff]
-   paddsw mm5, mm7
-   psubsw mm7, mm6
-   movq [%1 + %3*2 + 1*16], mm0
-@@ -287,28 +287,28 @@ fdct_r_row:
-   movq mm6, mm5
-   punpckldq mm3, mm5
-   punpckhdq mm6, mm3
--  movq mm3, [%3 + 0*2]
--  movq mm4, [%3 + 4*2]
-+  movq mm3, [0*2 + %3]
-+  movq mm4, [4*2 + %3]
-   punpckldq mm2, mm0
-   pmaddwd mm3, mm0
-   punpckhdq mm1, mm2
--  movq mm2, [%3 + 16*2]
-+  movq mm2, [16*2 + %3]
-   pmaddwd mm4, mm1
--  pmaddwd mm0, [%3 + 8*2]
--  movq mm7, [%3 + 20*2]
-+  pmaddwd mm0, [8*2 + %3]
-+  movq mm7, [20*2 + %3]
-   pmaddwd mm2, mm5
--  paddd mm3, [fdct_r_row]
-+  paddd mm3, [ebx + fdct_r_row wrt ..gotoff]
-   pmaddwd mm7, mm6
--  pmaddwd mm1, [%3 + 12*2]
-+  pmaddwd mm1, [12*2 + %3]
-   paddd mm3, mm4
--  pmaddwd mm5, [%3 + 24*2]
--  pmaddwd mm6, [%3 + 28*2]
-+  pmaddwd mm5, [24*2 + %3]
-+  pmaddwd mm6, [28*2 + %3]
-   paddd mm2, mm7
--  paddd mm0, [fdct_r_row]
-+  paddd mm0, [ebx + fdct_r_row wrt ..gotoff]
-   psrad mm3, SHIFT_FRW_ROW
--  paddd mm2, [fdct_r_row]
-+  paddd mm2, [ebx + fdct_r_row wrt ..gotoff]
-   paddd mm0, mm1
--  paddd mm5, [fdct_r_row]
-+  paddd mm5, [ebx + fdct_r_row wrt ..gotoff]
-   psrad mm2, SHIFT_FRW_ROW
-   paddd mm5, mm6
-   psrad mm0, SHIFT_FRW_ROW
-@@ -336,23 +336,23 @@ fdct_r_row:
-   psubsw mm1, mm5
-   pshufw mm2, mm0, 0x4E
-   pshufw mm3, mm1, 0x4E
--  movq mm4, [%3 +  0*2]
--  movq mm6, [%3 +  4*2]
--  movq mm5, [%3 + 16*2]
--  movq mm7, [%3 + 20*2]
-+  movq mm4, [ 0*2 + %3]
-+  movq mm6, [ 4*2 + %3]
-+  movq mm5, [16*2 + %3]
-+  movq mm7, [20*2 + %3]
-   pmaddwd mm4, mm0
-   pmaddwd mm5, mm1
-   pmaddwd mm6, mm2
-   pmaddwd mm7, mm3
--  pmaddwd mm0, [%3 +  8*2]
--  pmaddwd mm2, [%3 + 12*2]
--  pmaddwd mm1, [%3 + 24*2]
--  pmaddwd mm3, [%3 + 28*2]
-+  pmaddwd mm0, [ 8*2 + %3]
-+  pmaddwd mm2, [12*2 + %3]
-+  pmaddwd mm1, [24*2 + %3]
-+  pmaddwd mm3, [28*2 + %3]
-   paddd mm4, mm6
-   paddd mm5, mm7
-   paddd mm0, mm2
-   paddd mm1, mm3
--  movq mm7, [fdct_r_row]
-+  movq mm7, [ebx + fdct_r_row wrt ..gotoff]
-   paddd mm4, mm7
-   paddd mm5, mm7
-   paddd mm0, mm7
-@@ -377,6 +377,10 @@ cglobal %1
- 	;; Move the destination/source address to the eax register
-   mov eax, [esp + 4]
- 
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- 	;; Process the columns (4 at a time)
-   FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
-   FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
-@@ -386,12 +390,12 @@ cglobal %1
- %assign i 0
- %rep 8
- 	;; Process the 'i'th row
--  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
-+  %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff
- 	%assign i i+1
- %endrep
- %else
-   mov ecx, 8
--  mov edx, tab_frw_01234567
-+  mov edx, [ebx + tab_frw_01234567 wrt ..gotoff]
- ALIGN 8
- .loop
-   %2 eax, eax, edx
-@@ -401,6 +405,7 @@ ALIGN 8
-   jne .loop
- %endif
- 
-+  pop ebx
-   ret
- .endfunc
- %endmacro
-@@ -411,6 +416,11 @@ ALIGN 8
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- ;-----------------------------------------------------------------------------
- ; void fdct_mmx_ffmpeg(int16_t block[64]);
- ;-----------------------------------------------------------------------------
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -294,15 +294,15 @@ MMX_One:
-   paddsw mm2, mm1       ; mm2: t6+t5
-   movq [%1+0*16], mm5   ; => out0
- 
--  movq mm4, [tan2]      ; mm4 <= tan2
-+  movq mm4, [ebx + tan2 wrt ..gotoff]      ; mm4 <= tan2
-   pmulhw mm4, mm7       ; tm03*tan2
--  movq mm5, [tan2]      ; mm5 <= tan2
-+  movq mm5, [ebx + tan2 wrt ..gotoff]      ; mm5 <= tan2
-   psubsw mm4, mm6       ; out6 = tm03*tan2 - tm12
-   pmulhw mm5, mm6       ; tm12*tan2
-   paddsw mm5, mm7       ; out2 = tm12*tan2 + tm03
- 
--  movq mm6, [sqrt2]
--  movq mm7, [MMX_One]
-+  movq mm6, [ebx + sqrt2 wrt ..gotoff]
-+  movq mm7, [ebx + MMX_One wrt ..gotoff]
- 
-   pmulhw mm2, mm6       ; mm2: tp65 = (t6 + t5)*cos4
-   por mm5, mm7          ; correct out2
-@@ -320,8 +320,8 @@ MMX_One:
-   paddsw mm2, mm4       ; mm2: tp765 = t7 + tp65
-   paddsw mm1, mm5       ; mm1: tp465 = t4 + tm65
- 
--  movq mm4, [tan3]      ; tan3 - 1
--  movq mm5, [tan1]      ; tan1
-+  movq mm4, [ebx + tan3 wrt ..gotoff]      ; tan3 - 1
-+  movq mm5, [ebx + tan1 wrt ..gotoff]      ; tan1
- 
-   movq mm7, mm3         ; save tm465
-   pmulhw mm3, mm4       ; tm465*(tan3-1)
-@@ -364,23 +364,23 @@ MMX_One:
-   punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
-   punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
- 
--  movq mm2, qword [%3 + 0]     ;  [   M00    M01      M16    M17]
--  movq mm3, qword [%3 + 8]     ;  [   M02    M03      M18    M19]
-+  movq mm2, qword [0 + %3]     ;  [   M00    M01      M16    M17]
-+  movq mm3, qword [8 + %3]     ;  [   M02    M03      M18    M19]
-   pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
--  movq mm4, qword [%3 + 16]    ;  [   M04    M05      M20    M21]
-+  movq mm4, qword [16 + %3]    ;  [   M04    M05      M20    M21]
-   pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
--  movq mm5, qword [%3 + 24]    ;  [   M06    M07      M22    M23]
-+  movq mm5, qword [24 + %3]    ;  [   M06    M07      M22    M23]
-   pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
--  movq mm6, qword [%3 + 32]    ;  [   M08    M09      M24    M25]
-+  movq mm6, qword [32 + %3]    ;  [   M08    M09      M24    M25]
-   pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
--  movq mm7, qword [%3 + 40]    ;  [   M10    M11      M26    M27]
-+  movq mm7, qword [40 + %3]    ;  [   M10    M11      M26    M27]
-   pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
-   paddd mm2, mm3               ;  [ out0 | out1 ]
-   pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
-   psrad mm2, 16
--  pmaddwd mm0, qword [%3 + 48] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
-+  pmaddwd mm0, qword [48 + %3] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
-   paddd mm4, mm5               ;  [ out2 | out3 ]
--  pmaddwd mm1, qword [%3 + 56] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
-+  pmaddwd mm1, qword [56 + %3] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
-   psrad mm4, 16
- 
-   paddd mm6, mm7               ;  [ out4 | out5 ]
-@@ -422,23 +422,23 @@ MMX_One:
-   punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
-   punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
- 
--  movq mm2, qword [%3 + 0]     ;  [   M00    M01      M16    M17]
--  movq mm3, qword [%3 + 8]     ;  [   M02    M03      M18    M19]
-+  movq mm2, qword [0 + %3]     ;  [   M00    M01      M16    M17]
-+  movq mm3, qword [8 + %3]     ;  [   M02    M03      M18    M19]
-   pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
--  movq mm4, qword [%3 + 16]    ;  [   M04    M05      M20    M21]
-+  movq mm4, qword [16 + %3]    ;  [   M04    M05      M20    M21]
-   pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
--  movq mm5, qword [%3 + 24]    ;  [   M06    M07      M22    M23]
-+  movq mm5, qword [24 + %3]    ;  [   M06    M07      M22    M23]
-   pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
--  movq mm6, qword [%3 + 32]    ;  [   M08    M09      M24    M25]
-+  movq mm6, qword [32 + %3]    ;  [   M08    M09      M24    M25]
-   pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
--  movq mm7, qword [%3 + 40]    ;  [   M10    M11      M26    M27]
-+  movq mm7, qword [40 + %3]    ;  [   M10    M11      M26    M27]
-   pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
-   paddd mm2, mm3               ;  [ out0 | out1 ]
-   pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
-   psrad mm2, 16
--  pmaddwd mm0, qword [%3 + 48] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
-+  pmaddwd mm0, qword [48 + %3] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
-   paddd mm4, mm5               ;  [ out2 | out3 ]
--  pmaddwd mm1, qword [%3 + 56] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
-+  pmaddwd mm1, qword [56 + %3] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
-   psrad mm4, 16
- 
-   paddd mm6, mm7               ;  [ out4 | out5 ]
-@@ -467,12 +467,16 @@ MMX_One:
- ALIGN 16
- cglobal %1
- %1:
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %ifdef UNROLLED_LOOP
--  mov ecx, [esp + 4]
-+  mov ecx, [esp + 4 + 4]
- %else
--  push ebx
-+  push esi
-   push edi
--  mov ecx, [esp + 8 + 4]
-+  mov ecx, [esp + 12 + 4]
- %endif
- 
-   fLLM_PASS ecx+0, ecx+0, 3
-@@ -481,27 +485,28 @@ cglobal %1
- %ifdef UNROLLED_LOOP
- %assign i 0
- %rep 8
--  %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8
-+  %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff
- 	%assign i i+1
- %endrep
- %else
-   mov eax, 8
--  mov edx, fdct_table
--  mov ebx, fdct_rounding_1
--  mov edi, fdct_rounding_2
-+  lea edx, [ebx + fdct_table wrt ..gotoff]
-+  lea esi, [ebx + fdct_rounding_1 wrt ..gotoff]
-+  lea edi, [ebx + fdct_rounding_2 wrt ..gotoff]
- .loop
--  %2 ecx, ecx, edx, ebx, edi
-+  %2 ecx, ecx, edx, esi, edi
-   add ecx, 2*8
-   add edx, 2*32
--  add ebx, 2*4
-+  add esi, 2*4
-   add edi, 2*4
-   dec eax
-   jne .loop
- 
-   pop edi
--  pop ebx
-+  pop esi
- %endif
- 
-+  pop ebx
-   ret
- .endfunc
- %endmacro
-@@ -512,6 +517,11 @@ cglobal %1
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- ;-----------------------------------------------------------------------------
- ; void fdct_mmx_skal(int16_t block[64]];
- ;-----------------------------------------------------------------------------
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -238,10 +238,10 @@ cglobal fdct_sse2_skal
-   pshufd  xmm6, xmm0, 01010101b ; [13131313]
-   pshufd  xmm7, xmm0, 11111111b ; [57575757]
- 
--  pmaddwd xmm4, [%2+ 0]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
--  pmaddwd xmm5, [%2+16]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
--  pmaddwd xmm6, [%2+32]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
--  pmaddwd xmm7, [%2+48]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
-+  pmaddwd xmm4, [ 0 + %2]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
-+  pmaddwd xmm5, [16 + %2]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
-+  pmaddwd xmm6, [32 + %2]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
-+  pmaddwd xmm7, [48 + %2]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
-   paddd   xmm4, [%3]      ; Round
- 
-   paddd   xmm6, xmm7      ; [b0|b1|b2|b3]
-@@ -267,12 +267,12 @@ cglobal fdct_sse2_skal
- 
- %macro iLLM_PASS 1  ; %1: src/dst
- 
--  movdqa xmm0, [tan3]     ; t3-1
-+  movdqa xmm0, [ebx + tan3 wrt ..gotoff]     ; t3-1
-   movdqa xmm3, [%1+16*3]  ; x3
-   movdqa xmm1, xmm0       ; t3-1
-   movdqa xmm5, [%1+16*5]  ; x5
- 
--  movdqa xmm4, [tan1]     ; t1
-+  movdqa xmm4, [ebx + tan1 wrt ..gotoff]     ; t1
-   movdqa xmm6, [%1+16*1]  ; x1
-   movdqa xmm7, [%1+16*7]  ; x7
-   movdqa xmm2, xmm4       ; t1
-@@ -290,7 +290,7 @@ cglobal fdct_sse2_skal
-   psubsw xmm2, xmm7       ; x1*t1-x7 = tm17
- 
- 
--  movdqa xmm3, [sqrt2]
-+  movdqa xmm3, [ebx + sqrt2 wrt ..gotoff]
-   movdqa xmm7, xmm4
-   movdqa xmm6, xmm2
-   psubsw xmm4, xmm1       ; tp17-tp35 = t1
-@@ -310,7 +310,7 @@ cglobal fdct_sse2_skal
-   paddsw xmm0, xmm0       ; 2.(t1+t2) = b1
-   paddsw xmm4, xmm4       ; 2.(t1-t2) = b2
- 
--  movdqa xmm7, [tan2]     ; t2
-+  movdqa xmm7, [ebx + tan2 wrt ..gotoff]     ; t2
-   movdqa xmm3, [%1+2*16]  ; x2
-   movdqa xmm6, [%1+6*16]  ; x6
-   movdqa xmm5, xmm7       ; t2
-@@ -402,55 +402,58 @@ cglobal fdct_sse2_skal
- 
- ALIGN 16
- idct_sse2_skal:
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
- 
--  mov ecx, [esp+ 4]  ; Src
-+  mov ecx, [esp+ 4 +4]  ; Src
- 
-   TEST_ROW ecx, .Row0_Round
--  iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
-+  iMTX_MULT  0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11
-   jmp .Row1
- .Row0_Round
--  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
-+  movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff]
-   movdqa [ecx  ], xmm0
- 
- .Row1
-   TEST_ROW ecx+16, .Row1_Round
--  iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
-+  iMTX_MULT  1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11
-   jmp .Row2
- .Row1_Round
--  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
-+  movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff]
-   movdqa [ecx+16  ], xmm0
- 
- .Row2
-   TEST_ROW ecx+32, .Row2_Round
--  iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
-+  iMTX_MULT  2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11
-   jmp .Row3
- .Row2_Round
--  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
-+  movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff]
-   movdqa [ecx+32  ], xmm0
- 
- .Row3
-   TEST_ROW ecx+48, .Row4
--  iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11
-+  iMTX_MULT  3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11
- 
- .Row4
-   TEST_ROW ecx+64, .Row5
--  iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11
-+  iMTX_MULT  4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11
- 
- .Row5
-   TEST_ROW ecx+80, .Row6
--  iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11
-+  iMTX_MULT  5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11
- 
- .Row6
-   TEST_ROW ecx+96, .Row7
--  iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11
-+  iMTX_MULT  6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11
- 
- .Row7
-   TEST_ROW ecx+112, .End
--  iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
-+  iMTX_MULT  7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt ..gotoff, 11
- .End
- 
-   iLLM_PASS ecx
--
-+  pop ebx
-   ret
- .endfunc
- 
-@@ -507,15 +510,15 @@ idct_sse2_skal:
-   paddsw xmm2, xmm1         ; xmm2: t6+t5
-   movdqa [%1+0*16], xmm5   ; => out0
- 
--  movdqa xmm4, [tan2]      ; xmm4 <= tan2
-+  movdqa xmm4, [ebx + tan2 wrt ..gotoff]      ; xmm4 <= tan2
-   pmulhw xmm4, xmm7         ; tm03*tan2
--  movdqa xmm5, [tan2]      ; xmm5 <= tan2
-+  movdqa xmm5, [ebx + tan2 wrt ..gotoff]      ; xmm5 <= tan2
-   psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
-   pmulhw xmm5, xmm6         ; tm12*tan2
-   paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03
- 
--  movdqa xmm6, [sqrt2]  
--  movdqa xmm7, [Rounder1]
-+  movdqa xmm6, [ebx + sqrt2 wrt ..gotoff]
-+  movdqa xmm7, [ebx + Rounder1 wrt ..gotoff]
- 
-   pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
-   por    xmm5, xmm7         ; correct out2
-@@ -533,8 +536,8 @@ idct_sse2_skal:
-   paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
-   paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65
- 
--  movdqa xmm4, [tan3]      ; tan3 - 1
--  movdqa xmm5, [tan1]      ; tan1
-+  movdqa xmm4, [ebx + tan3 wrt ..gotoff]      ; tan3 - 1
-+  movdqa xmm5, [ebx + tan1 wrt ..gotoff]      ; tan1
- 
-   movdqa xmm7, xmm3         ; save tm465
-   pmulhw xmm3, xmm4         ; tm465*(tan3-1)
-@@ -581,12 +584,12 @@ idct_sse2_skal:
-     ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x mm0 = [4 /5 /6'/7']
-     ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x mm2 = [4'/5'/6 /7 ]
- 
--  movdqa  xmm1, [%2+16]
--  movdqa  xmm3, [%2+32]
-+  movdqa  xmm1, [16+%2]
-+  movdqa  xmm3, [32+%2]
-   pmaddwd xmm1, xmm2
-   pmaddwd xmm3, xmm0
--  pmaddwd xmm2, [%2+48]
--  pmaddwd xmm0, [%2+ 0]
-+  pmaddwd xmm2, [48+%2]
-+  pmaddwd xmm0, [ 0+%2]
- 
-   paddd   xmm0, xmm1             ;  [ out0 | out1 ][ out2 | out3 ]
-   paddd   xmm2, xmm3             ;  [ out4 | out5 ][ out6 | out7 ]
-@@ -601,22 +604,33 @@ idct_sse2_skal:
-   movdqa  [ecx+%1*16+0], xmm0
- %endmacro
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- ;-----------------------------------------------------------------------------
- ; Function Forward DCT
- ;-----------------------------------------------------------------------------
- 
- ALIGN 16
- fdct_sse2_skal:
--  mov ecx, [esp+4]
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov ecx, [esp+4+4]
-   fLLM_PASS ecx+0, 3
--  fMTX_MULT  0, fTab1, Fdct_Rnd0
--  fMTX_MULT  1, fTab2, Fdct_Rnd2
--  fMTX_MULT  2, fTab3, Fdct_Rnd1
--  fMTX_MULT  3, fTab4, Fdct_Rnd1
--  fMTX_MULT  4, fTab1, Fdct_Rnd0
--  fMTX_MULT  5, fTab4, Fdct_Rnd1
--  fMTX_MULT  6, fTab3, Fdct_Rnd1
--  fMTX_MULT  7, fTab2, Fdct_Rnd1
-+  fMTX_MULT  0, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
-+  fMTX_MULT  1, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd2 wrt ..gotoff
-+  fMTX_MULT  2, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+  fMTX_MULT  3, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+  fMTX_MULT  4, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
-+  fMTX_MULT  5, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+  fMTX_MULT  6, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+  fMTX_MULT  7, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+
-+  pop ebx
-   ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_3dne.asm xvidcore-1.1.2/src/dct/x86_asm/idct_3dne.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_3dne.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_3dne.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -223,6 +223,11 @@ tab_i_35_xmm:
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- cglobal idct_3dne
- 
- ;-----------------------------------------------------------------------------
-@@ -231,25 +236,29 @@ cglobal idct_3dne
- 
- ALIGN 16
- idct_3dne:
--  mov eax, [esp+4]
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp+4+4]
- 
- ;   DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0
-   pshufw mm0, [eax+64],10001000b        ; x2 x0 x2 x0
--  movq mm3, [tab_i_04_xmm]          ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff]          ; 3     ; w05 w04 w01 w00
-   pshufw mm1, [eax+64+8],10001000b  ; x6 x4 x6 x4
--  movq mm4, [tab_i_04_xmm+8]        ; 4     ; w07 w06 w03 w02
-+  movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff]        ; 4     ; w07 w06 w03 w02
-   pshufw mm2, [eax+64],11011101b        ; x3 x1 x3 x1
-   pshufw mm5, [eax+64+8],11011101b  ; x7 x5 x7 x5
--  movq mm6, [tab_i_04_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
--  movq mm7, [tab_i_04_xmm+40]   ; 7     ; w23 w22 w19 w18 ;
--  pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18 ;
-+  pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+80+8],10001000b  ; x6 x4 x6 x4
-@@ -260,12 +269,12 @@ idct_3dne:
-   movq mm7, mm0                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
-   paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_35_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm0, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -276,19 +285,19 @@ idct_3dne:
-   movq [eax+64], mm6            ; 3     ; save y3 y2 y1 y0 stall2
- 
- ;   DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5
--  movq mm4, [tab_i_35_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_35_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
--  paddd mm3, [rounder_5]        ; +rounder stall 6
--  paddd mm5, [rounder_5]        ; +rounder
-+  paddd mm3, [ebx + rounder_5 wrt ..gotoff]        ; +rounder stall 6
-+  paddd mm5, [ebx + rounder_5 wrt ..gotoff]        ; +rounder
-   movq [eax+64+8], mm7          ; 7     ; save y7 y6 y5 y4
--  movq mm7, [tab_i_35_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+96+8],10001000b  ; x6 x4 x6 x4
-@@ -299,12 +308,12 @@ idct_3dne:
-   movq mm7, mm5                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
-   paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_26_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm5, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -315,19 +324,19 @@ idct_3dne:
-   movq [eax+80], mm6            ; 3     ; save y3 y2 y1 y0
- 
- ;   DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6
--  movq mm4, [tab_i_26_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_26_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
--  paddd mm3, [rounder_6]        ; +rounder
--  paddd mm0, [rounder_6]        ; +rounder
-+  paddd mm3, [ebx + rounder_6 wrt ..gotoff]        ; +rounder
-+  paddd mm0, [ebx + rounder_6 wrt ..gotoff]        ; +rounder
-   movq [eax+80+8], mm7          ; 7     ; save y7 y6
--  movq mm7, [tab_i_26_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4
-@@ -338,12 +347,12 @@ idct_3dne:
-   movq mm7, mm0                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
-   paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_17_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm0, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -354,19 +363,19 @@ idct_3dne:
-   movq [eax+96], mm6            ; 3     ; save y3 y2 y1 y0 stall2
- 
- ;   DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7
--  movq mm4, [tab_i_17_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_17_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
--  paddd mm3, [rounder_7]        ; +rounder stall 6
--  paddd mm5, [rounder_7]        ; +rounder
-+  paddd mm3, [ebx + rounder_7 wrt ..gotoff]        ; +rounder stall 6
-+  paddd mm5, [ebx + rounder_7 wrt ..gotoff]        ; +rounder
-   movq [eax+96+8], mm7          ; 7     ; save y7 y6 y5 y4
--  movq mm7, [tab_i_17_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4
-@@ -377,12 +386,12 @@ idct_3dne:
-   movq mm7, mm5                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
-   paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_04_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm5, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1
-   pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -393,19 +402,19 @@ idct_3dne:
-   movq [eax+112], mm6           ; 3     ; save y3 y2 y1 y0
- 
- ;   DCT_8_INV_ROW_1_s [eax+0],  0, tab_i_04_xmm, rounder_0
--  movq mm4, [tab_i_04_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_04_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
--  paddd mm3, [rounder_0]        ; +rounder
--  paddd mm0, [rounder_0]        ; +rounder
-+  paddd mm3, [ebx + rounder_0 wrt ..gotoff]        ; +rounder
-+  paddd mm0, [ebx + rounder_0 wrt ..gotoff]        ; +rounder
-   movq [eax+112+8], mm7         ; 7     ; save y7 y6
--  movq mm7, [tab_i_04_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm0, mm1                ; 1
-   pshufw mm1, [eax+16+8],10001000b  ; x6 x4 x6 x4
-@@ -416,12 +425,12 @@ idct_3dne:
-   movq mm7, mm0                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
-   paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_17_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm0, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -432,19 +441,19 @@ idct_3dne:
-   movq [eax+0], mm6             ; 3     ; save y3 y2 y1 y0 stall2
- 
- ; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1
--  movq mm4, [tab_i_17_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_17_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
--  paddd mm3, [rounder_1]        ; +rounder stall 6
--  paddd mm5, [rounder_1]        ; +rounder
-+  paddd mm3, [ebx + rounder_1 wrt ..gotoff]        ; +rounder stall 6
-+  paddd mm5, [ebx + rounder_1 wrt ..gotoff]        ; +rounder
-   movq [eax+0+8], mm7           ; 7     ; save y7 y6 y5 y4
--  movq mm7, [tab_i_17_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+32+8],10001000b  ; x6 x4 x6 x4
-@@ -455,12 +464,12 @@ idct_3dne:
-   movq mm7, mm5                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
-   paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_26_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm5, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -471,19 +480,19 @@ idct_3dne:
-   movq [eax+16], mm6            ; 3     ; save y3 y2 y1 y0
- 
- ;   DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2
--  movq mm4, [tab_i_26_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_26_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
--  paddd mm3, [rounder_2]        ; +rounder
--  paddd mm0, [rounder_2]        ; +rounder
-+  paddd mm3, [ebx + rounder_2 wrt ..gotoff]        ; +rounder
-+  paddd mm0, [ebx + rounder_2 wrt ..gotoff]        ; +rounder
-   movq [eax+16+8], mm7          ; 7     ; save y7 y6
--  movq mm7, [tab_i_26_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
-   pshufw mm1, [eax+48+8],10001000b      ; x6 x4 x6 x4
-@@ -494,12 +503,12 @@ idct_3dne:
-   movq mm7, mm0                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
-   paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
--  movq mm3, [tab_i_35_xmm]      ; 3     ; w05 w04 w01 w00
-+  movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
-   psubd mm7, mm2                ; ; a3-b3 a2-b2
-   paddd mm0, mm2                ; 0 free a3+b3 a2+b2
-   pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1
-   pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
--  pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
-   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
-   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
-   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -510,26 +519,26 @@ idct_3dne:
-   movq [eax+32], mm6            ; 3     ; save y3 y2 y1 y0 stall2
- 
- ;   DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3
--  movq mm4, [tab_i_35_xmm+8]    ; 4     ; w07 w06 w03 w02
--  movq mm6, [tab_i_35_xmm+32]   ; 6     ; w21 w20 w17 w16
-+  movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
-+  movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
-   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
--  paddd mm3, [rounder_3]        ; +rounder stall 6
--  paddd mm5, [rounder_3]        ; +rounder
-+  paddd mm3, [ebx + rounder_3 wrt ..gotoff]        ; +rounder stall 6
-+  paddd mm5, [ebx + rounder_3 wrt ..gotoff]        ; +rounder
-   movq [eax+32+8], mm7          ; 7     ; save y7 y6 y5 y4
--  movq mm7, [tab_i_35_xmm+40]   ; 7     ; w23 w22 w19 w18
-+  movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
--  pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
-   pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
--  pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
-   paddd mm5, mm1                ; mm1 free  ; a3=sum(even3) a2=sum(even2)
--  movq mm1, [tg_3_16]
-+  movq mm1, [ebx + tg_3_16 wrt ..gotoff]
-   movq mm4, mm3                 ; 4     ; a1 a0
-   paddd mm6, mm7                ; 7 free    ; b1=sum(odd1) b0=sum(odd0)
-   paddd mm2, mm0                ; 5 free    ; b3=sum(odd3) b2=sum(odd2)
--  movq mm0, [tg_3_16]
-+  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
-   movq mm7, mm5                 ; 7     ; a3 a2
-   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
-   paddd mm3, mm6                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-@@ -542,7 +551,7 @@ idct_3dne:
-   psrad mm2, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
-   movq mm6, [eax+16*1]
-   packssdw mm7, mm4             ; 4     ; y6 y7 y4 y5
--  movq mm4, [tg_1_16]
-+  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
-   packssdw mm3, mm2             ; 0 free    ; y3 y2 y1 y0
-   pshufw mm2, mm7, 10110001b    ; y7 y6 y5 y4
- 
-@@ -559,7 +568,7 @@ idct_3dne:
-   paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
-   psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
-   movq [eax+48], mm3        ; 3     ; save y3 y2 y1 y0
--  movq mm3, [ocos_4_16]
-+  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
-   paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
-   paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
-   psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
-@@ -569,7 +578,7 @@ idct_3dne:
-   psubsw mm6, mm0           ; tm17-tm35 = b3
-   psubsw mm4, mm1           ; tp17-tp35 = t1
-   paddsw mm2, mm0           ; tm17+tm35 = t2
--  movq mm7, [tg_2_16]
-+  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
-   movq mm1, mm4             ; t1
-   movq [eax+3*16], mm5      ; save b0
-   paddsw mm1, mm2           ; t1+t2
-@@ -620,7 +629,7 @@ idct_3dne:
-   movq mm6, mm2             ; a3
-   psraw mm4, SHIFT_INV_COL  ; dst7
-   movq [eax+5*16], mm0
--  movq mm0, [tg_3_16]
-+  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
-   paddsw mm2, mm3           ; a3+b3
-   movq [eax+6*16], mm7
-   psubsw mm6, mm3           ; a3-b3
-@@ -634,7 +643,7 @@ idct_3dne:
-   movq mm5, [eax+8+16*5]
-   psraw mm6, SHIFT_INV_COL  ; dst4
-   pmulhw mm0, mm3           ; x3*(tg_3_16-1)
--  movq mm4, [tg_1_16]
-+  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
-   pmulhw mm1, mm5           ; x5*(tg_3_16-1)
-   movq mm7, [eax+8+16*7]
-   movq [eax+3*16], mm2
-@@ -646,7 +655,7 @@ idct_3dne:
-   pmulhw mm2, mm6           ; x1*tg_1_16
-   paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
-   psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
--  movq mm3, [ocos_4_16]
-+  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
-   paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
-   paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
-   psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
-@@ -655,7 +664,7 @@ idct_3dne:
-   paddsw mm5, mm1           ; tp17+tp35 = b0
-   psubsw mm4, mm1           ; tp17-tp35 = t1
-   paddsw mm2, mm0           ; tm17+tm35 = t2
--  movq mm7, [tg_2_16]
-+  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
-   movq mm1, mm4             ; t1
-   psubsw mm6, mm0           ; tm17-tm35 = b3
-   movq [eax+8+3*16], mm5    ; save b0
-@@ -717,6 +726,7 @@ idct_3dne:
-   movq [eax+8+3*16], mm2
-   movq [eax+8+4*16], mm6
- 
-+  pop ebx
-   ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_mmx.asm xvidcore-1.1.2/src/dct/x86_asm/idct_mmx.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -326,25 +326,25 @@ tab_i_35_xmm:
-   punpcklwd mm0, mm1        ; x5 x1 x4 x0
-   movq mm5, mm0             ; 5 ; x5 x1 x4 x0
-   punpckldq mm0, mm0        ; x4 x0 x4 x0
--  movq mm4, [%3+8]          ; 4 ; w07 w05 w03 w01
-+  movq mm4, [8+%3]          ; 4 ; w07 w05 w03 w01
-   punpckhwd mm2, mm1        ; 1 ; x7 x3 x6 x2
-   pmaddwd mm3, mm0          ; x4*w06+x0*w04 x4*w02+x0*w00
-   movq mm6, mm2             ; 6 ; x7 x3 x6 x2
--  movq mm1, [%3+32]         ; 1 ; w22 w20 w18 w16
-+  movq mm1, [32+%3]         ; 1 ; w22 w20 w18 w16
-   punpckldq mm2, mm2        ; x6 x2 x6 x2
-   pmaddwd mm4, mm2          ; x6*w07+x2*w05 x6*w03+x2*w01
-   punpckhdq mm5, mm5        ; x5 x1 x5 x1
--  pmaddwd mm0, [%3+16]      ; x4*w14+x0*w12 x4*w10+x0*w08
-+  pmaddwd mm0, [16+%3]      ; x4*w14+x0*w12 x4*w10+x0*w08
-   punpckhdq mm6, mm6        ; x7 x3 x7 x3
--  movq mm7, [%3+40]         ; 7 ; w23 w21 w19 w17
-+  movq mm7, [40+%3]         ; 7 ; w23 w21 w19 w17
-   pmaddwd mm1, mm5          ; x5*w22+x1*w20 x5*w18+x1*w16
-   paddd mm3, [%4]           ; +%4
-   pmaddwd mm7, mm6          ; x7*w23+x3*w21 x7*w19+x3*w17
--  pmaddwd mm2, [%3+24]      ; x6*w15+x2*w13 x6*w11+x2*w09
-+  pmaddwd mm2, [24+%3]      ; x6*w15+x2*w13 x6*w11+x2*w09
-   paddd mm3, mm4            ; 4 ; a1=sum(even1) a0=sum(even0)
--  pmaddwd mm5, [%3+48]      ; x5*w30+x1*w28 x5*w26+x1*w24
-+  pmaddwd mm5, [48+%3]      ; x5*w30+x1*w28 x5*w26+x1*w24
-   movq mm4, mm3             ; 4 ; a1 a0
--  pmaddwd mm6, [%3+56]      ; x7*w31+x3*w29 x7*w27+x3*w25
-+  pmaddwd mm6, [56+%3]      ; x7*w31+x3*w29 x7*w27+x3*w25
-   paddd mm1, mm7            ; 7 ; b1=sum(odd1) b0=sum(odd0)
-   paddd mm0, [%4]           ; +%4
-   psubd mm3, mm1            ; a1-b1 a0-b0
-@@ -378,25 +378,25 @@ tab_i_35_xmm:
-   movq mm2, mm0                 ; 2     ; x3 x2 x1 x0
-   movq mm3, [%3]                ; 3     ; w05 w04 w01 w00
-   pshufw mm0, mm0, 10001000b    ; x2 x0 x2 x0
--  movq mm4, [%3+8]              ; 4     ; w07 w06 w03 w02
-+  movq mm4, [8+%3]              ; 4     ; w07 w06 w03 w02
-   movq mm5, mm1                 ; 5     ; x7 x6 x5 x4
-   pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
--  movq mm6, [%3+32]             ; 6     ; w21 w20 w17 w16
-+  movq mm6, [32+%3]             ; 6     ; w21 w20 w17 w16
-   pshufw mm1, mm1, 10001000b    ; x6 x4 x6 x4
-   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
--  movq mm7, [%3+40]             ; 7    ; w23 w22 w19 w18
-+  movq mm7, [40+%3]             ; 7    ; w23 w22 w19 w18
-   pshufw mm2, mm2, 11011101b    ; x3 x1 x3 x1
-   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
-   pshufw mm5, mm5, 11011101b    ; x7 x5 x7 x5
-   pmaddwd mm7, mm5              ; x7*w23+x5*w22 x7*w19+x5*w18
-   paddd mm3, [%4]               ; +%4
--  pmaddwd mm0, [%3+16]          ; x2*w13+x0*w12 x2*w09+x0*w08
-+  pmaddwd mm0, [16+%3]          ; x2*w13+x0*w12 x2*w09+x0*w08
-   paddd mm3, mm4                ; 4     ; a1=sum(even1) a0=sum(even0)
--  pmaddwd mm1, [%3+24]          ; x6*w15+x4*w14 x6*w11+x4*w10
-+  pmaddwd mm1, [24+%3]          ; x6*w15+x4*w14 x6*w11+x4*w10
-   movq mm4, mm3                 ; 4     ; a1 a0
--  pmaddwd mm2, [%3+48]          ; x3*w29+x1*w28 x3*w25+x1*w24
-+  pmaddwd mm2, [48+%3]          ; x3*w29+x1*w28 x3*w25+x1*w24
-   paddd mm6, mm7                ; 7     ; b1=sum(odd1) b0=sum(odd0)
--  pmaddwd mm5, [%3+56]          ; x7*w31+x5*w30 x7*w27+x5*w26
-+  pmaddwd mm5, [56+%3]          ; x7*w31+x5*w30 x7*w27+x5*w26
-   paddd mm3, mm6                ; a1+b1 a0+b0
-   paddd mm0, [%4]               ; +%4
-   psrad mm3, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
-@@ -480,12 +480,12 @@ tab_i_35_xmm:
- ;-----------------------------------------------------------------------------
- 
- %macro DCT_8_INV_COL 2
--  movq mm0, [tg_3_16]
-+  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
-   movq mm3, [%1+16*3]
-   movq mm1, mm0             ; tg_3_16
-   movq mm5, [%1+16*5]
-   pmulhw mm0, mm3           ; x3*(tg_3_16-1)
--  movq mm4, [tg_1_16]
-+  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
-   pmulhw mm1, mm5           ; x5*(tg_3_16-1)
-   movq mm7, [%1+16*7]
-   movq mm2, mm4             ; tg_1_16
-@@ -495,7 +495,7 @@ tab_i_35_xmm:
-   pmulhw mm2, mm6           ; x1*tg_1_16
-   paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
-   psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
--  movq mm3, [ocos_4_16]
-+  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
-   paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
-   paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
-   psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
-@@ -505,7 +505,7 @@ tab_i_35_xmm:
-   psubsw mm6, mm0           ; tm17-tm35 = b3
-   psubsw mm4, mm1           ; tp17-tp35 = t1
-   paddsw mm2, mm0           ; tm17+tm35 = t2
--  movq mm7, [tg_2_16]
-+  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
-   movq mm1, mm4             ; t1
- ;  movq [SCRATCH+0], mm5    ; save b0
-   movq [%2+3*16], mm5       ; save b0
-@@ -577,6 +577,11 @@ tab_i_35_xmm:
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+    mov ebx, [esp]
-+    retn
-+
- cglobal idct_mmx
- cglobal idct_xmm
- 
-@@ -586,22 +591,27 @@ cglobal idct_xmm
- 
- ALIGN 16
- idct_mmx:
--    mov eax, dword [esp + 4]
-+    push ebx
-+    call get_pc.bx
-+    add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+    mov eax, dword [esp + 4 + 4]
- 
- 	;; Process each row
--    DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0
--    DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1
--    DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2
--    DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3
--    DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4
--    DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5
--    DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6
--    DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7
-+    DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
-+    DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
- 
- 	;; Process the columns (4 at a time)
-     DCT_8_INV_COL eax+0, eax+0
-     DCT_8_INV_COL eax+8, eax+8
- 
-+    pop ebx
-     ret
- .endfunc
- 
-@@ -611,22 +621,27 @@ idct_mmx:
- 
- ALIGN 16
- idct_xmm:
--    mov eax, dword [esp + 4]
-+    push ebx
-+    call get_pc.bx
-+    add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+    mov eax, dword [esp + 4 + 4]
- 
- 	;; Process each row
--    DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0
--    DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1
--    DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2
--    DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3
--    DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4
--    DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5
--    DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6
--    DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7
-+    DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
-+    DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
- 
- 	;; Process the columns (4 at a time)
-     DCT_8_INV_COL eax+0, eax+0
-     DCT_8_INV_COL eax+8, eax+8
- 
-+    pop ebx
-     ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_sse2_dmitry.asm xvidcore-1.1.2/src/dct/x86_asm/idct_sse2_dmitry.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_sse2_dmitry.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_sse2_dmitry.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -183,7 +183,7 @@ cglobal idct_sse2_dmitry
- 
-   ;a 3210 first part
-   pshufd xmm2, xmm1, 10101010b      ;x 64646464
--  pmaddwd xmm2, [%3+16]             ;w 15 14 11 10 7632
-+  pmaddwd xmm2, [16+%3]             ;w 15 14 11 10 7632
- 
-   ;a 3210 second part
-   paddd xmm2, xmm0                  ;a 3210 ready
-@@ -191,11 +191,11 @@ cglobal idct_sse2_dmitry
-   movdqa xmm5, xmm2
- 
-   pshufd xmm3, xmm1, 01010101b      ;x 31313131
--  pmaddwd xmm3, [%3+32]             ;w 29 28 25 24 21 20 17 16
-+  pmaddwd xmm3, [32+%3]             ;w 29 28 25 24 21 20 17 16
- 
-   ;b 3210 first part
-   pshufd xmm4, xmm1, 11111111b      ;x 75757575
--  pmaddwd xmm4, [%3+48]             ;w 31 30 27 26 23 22 19 18
-+  pmaddwd xmm4, [48+%3]             ;w 31 30 27 26 23 22 19 18
- 
-   ;b 3210 second part
-   paddd xmm3,xmm4                   ;b 3210 ready
-@@ -220,7 +220,7 @@ cglobal idct_sse2_dmitry
- 
-   movdqa xmm4, [%1+16*2]          	;x2
-   movdqa xmm5, [%1+16*6]          	;x6
--  movdqa xmm6, [tg_2_16]
-+  movdqa xmm6, [ebx + tg_2_16 wrt ..gotoff]
-   movdqa xmm7, xmm6
- 
-   paddsw xmm0, xmm2                  ;u04=x0+x4
-@@ -245,12 +245,12 @@ cglobal idct_sse2_dmitry
- 
-   movdqa xmm0, [%1+16*1]          	;x1
-   movdqa xmm1, [%1+16*7]          	;x7
--  movdqa xmm2, [tg_1_16]
-+  movdqa xmm2, [ebx + tg_1_16 wrt ..gotoff]
-   movdqa xmm3, xmm2
- 
-   movdqa xmm4, [%1+16*3]          	;x3
-   movdqa xmm5, [%1+16*5]          	;x5
--  movdqa xmm6, [tg_3_16]
-+  movdqa xmm6, [ebx + tg_3_16 wrt ..gotoff]
-   movdqa xmm7, xmm6
- 
-   pmulhw xmm2, xmm0
-@@ -267,7 +267,7 @@ cglobal idct_sse2_dmitry
-   psubsw xmm6, xmm5                  ;v35=x3*T3-x5
-   paddsw xmm7, xmm4                  ;u35=x5*T3+x3
- 
--  movdqa xmm4, [ocos_4_16]
-+  movdqa xmm4, [ebx + ocos_4_16 wrt ..gotoff]
- 
-   paddsw xmm0, xmm7                 ;b0=u17+u35
-   psubsw xmm1, xmm6                 ;b3=v17-v35
-@@ -322,26 +322,35 @@ cglobal idct_sse2_dmitry
-   movdqa [%2+16*5], xmm7
- %endmacro
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- ;-----------------------------------------------------------------------------
- ; void idct_sse2_dmitry(int16_t coeff[64]);
- ;-----------------------------------------------------------------------------
- 
- ALIGN 16
- idct_sse2_dmitry:
--
--  mov eax, [esp + 4]
--
--  DCT_8_INV_ROW_1_SSE2 eax+  0, eax+  0, tab_i_04, rounder_2_0
--  DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1
--  DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2
--  DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3
--  DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4
--  DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5
--  DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6
--  DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 4 + 4]
-+
-+  DCT_8_INV_ROW_1_SSE2 eax+  0, eax+  0, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_0 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_1 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_2 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_3 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_4 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_5 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_6 wrt ..gotoff
-+  DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_7 wrt ..gotoff
- 
-   DCT_8_INV_COL_4_SSE2 eax, eax
- 
-+  pop ebx
-   ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/simple_idct_mmx.asm xvidcore-1.1.2/src/dct/x86_asm/simple_idct_mmx.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/simple_idct_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/simple_idct_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -122,7 +122,7 @@ coeffs:
-   movq mm1,[src4]               ; R6    R2  r6  r2
-   movq mm2,[src1]               ; R3    R1  r3  r1
-   movq mm3,[src5]               ; R7    R5  r7  r5
--  movq mm4,[wm1010]
-+  movq mm4,[ebx + wm1010 wrt ..gotoff]
-   pand mm4,mm0
-   por mm4,mm1
-   por mm4,mm2
-@@ -131,29 +131,29 @@ coeffs:
-   movd eax,mm4
-   or eax,eax
-   jz near .skip1
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
-   paddd mm4,mm5                 ; A0        a0
-   psubd mm6,mm5                 ; A3        a3
--  movq mm5,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm5,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
-   rounder_op mm0, rounder_arg
-   paddd mm1,mm0                 ; A1        a1
-   paddd mm0,mm0
-   psubd mm0,mm1                 ; A2        a2
--  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
-+  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm5                 ; B0        b0
--  movq mm5,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm5,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -170,14 +170,14 @@ coeffs:
-   packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
-   movq [dst],mm7
-   movq mm1,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+80]          ;-C1    C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ;-C1    C5  -C1     C5
-   movq [dst + 24],mm2
-   pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
--  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
--  pmaddwd mm1,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-+  pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm0                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm4,mm7                 ; B2        b2
-   paddd mm2,mm4                 ; A2+B2     a2+b2
-   psubd mm0,mm4                 ; a2-B2     a2-b2
-@@ -196,7 +196,7 @@ coeffs:
-   jmp short .skip2
- .skip1
-   pslld mm0,16
--  paddd mm0,[d40000]
-+  paddd mm0,[ebx + d40000 wrt ..gotoff]
-   psrad mm0,13
-   packssdw mm0,mm0
-   movq [ dst ],mm0
-@@ -240,29 +240,29 @@ coeffs:
-   movd eax,mm4
-   or eax,eax
-   jz near bt
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
-   paddd mm4,mm5                 ; A0        a0
-   psubd mm6,mm5                 ; A3        a3
--  movq mm5,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm5,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
-   rounder_op mm0, rounder_arg
-   paddd mm1,mm0                 ; A1        a1
-   paddd mm0,mm0
-   psubd mm0,mm1                 ; A2        a2
--  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
-+  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm5                 ; B0        b0
--  movq mm5,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm5,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -279,14 +279,14 @@ coeffs:
-   packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
-   movq [ dst ],mm7
-   movq mm1,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
-   movq [ dst + 24 ],mm2
-   pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
--  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
--  pmaddwd mm1,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-+  pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm0                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm4,mm7                 ; B2        b2
-   paddd mm2,mm4                 ; A2+B2     a2+b2
-   psubd mm0,mm4                 ; a2-B2     a2-b2
-@@ -330,17 +330,17 @@ coeffs:
-   movq mm1,[src4]               ; R6    R2  r6  r2
-   movq mm2,[src1]               ; R3    R1  r3  r1
-   movq mm3,[src5]               ; R7    R5  r7  r5
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   ; rounder_op mm0, rounder_arg
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   paddd mm4,mm5                 ; A0        a0
-@@ -348,11 +348,11 @@ coeffs:
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
-   paddd mm0,mm1                 ; A1        a1
-   psubd mm5,mm1                 ; A2        a2
--  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
--  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
-+  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm1                 ; B0        b0
--  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm1,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -374,13 +374,13 @@ coeffs:
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [ dst + 112],mm4
-   movq mm0,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
-   pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
--  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
--  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-+  pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm5                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm4,mm7                 ; B2        b2
-   paddd mm2,mm4                 ; A2+B2     a2+b2
-   psubd mm5,mm4                 ; a2-B2     a2-b2
-@@ -426,13 +426,13 @@ coeffs:
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm1,[src4]               ; R6    R2  r6  r2
-   movq mm3,[src5]               ; R7    R5  r7  r5
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
-@@ -442,9 +442,9 @@ coeffs:
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
-   paddd mm0,mm1                 ; A1        a1
-   psubd mm5,mm1                 ; A2        a2
--  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
--  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm7,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm1,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -464,10 +464,10 @@ coeffs:
-   movd [ dst + 96 ],mm2
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [ dst + 112 ],mm4
--  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
-+  movq mm1,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-   pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm5                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm2,mm1                 ; A2+B2     a2+b2
-   psubd mm5,mm1                 ; a2-B2     a2-b2
-   psrad mm2,shift
-@@ -510,17 +510,17 @@ coeffs:
- %define	shift		%8
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm3,[src5]               ; R7    R5  r7  r5
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
-   ; rounder_op mm0, rounder_arg
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
--  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm7,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm1,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -540,10 +540,10 @@ coeffs:
-   movd [ dst + 96 ],mm2
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [ dst + 112 ],mm4
--  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
-+  movq mm1,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-   pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm5                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm2,mm1                 ; A2+B2     a2+b2
-   psubd mm5,mm1                 ; a2-B2     a2-b2
-   psrad mm2,shift
-@@ -587,21 +587,21 @@ coeffs:
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm2,[src1]               ; R3    R1  r3  r1
-   movq mm3,[src5]               ; R7    R5  r7  r5
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   ; rounder_op mm0, rounder_arg
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
-+  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
-   pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
--  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
-+  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm1                 ; B0        b0
--  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
-+  movq mm1,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
-   pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -623,13 +623,13 @@ coeffs:
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [dst + 112],mm4
-   movq mm0,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
-   pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
--  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
--  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
-+  pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
-   movq mm2,mm5                  ; A2        a2
--  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
-+  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
-   paddd mm4,mm7                 ; B2        b2
-   paddd mm2,mm4                 ; A2+B2     a2+b2
-   psubd mm5,mm4                 ; a2-B2     a2-b2
-@@ -674,17 +674,17 @@ coeffs:
- %define shift       %8
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm2,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   ; rounder_op mm0, rounder_arg
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm3,[coeffs+64]
-+  movq mm3,[ebx + coeffs+64 wrt ..gotoff]
-   pmaddwd mm3,mm2               ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -704,9 +704,9 @@ coeffs:
-   movd [dst + 96],mm1
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [dst + 112],mm4
--  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
-   pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
--  pmaddwd mm2,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   movq mm1,mm5                  ; A2        a2
-   paddd mm1,mm4                 ; A2+B2     a2+b2
-   psubd mm5,mm4                 ; a2-B2     a2-b2
-@@ -750,13 +750,13 @@ coeffs:
- %define	shift		%8
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm1,[src4]               ; R6    R2  r6  r2
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
-@@ -768,13 +768,13 @@ coeffs:
-   psubd mm5,mm1                 ; A2        a2
-   movq mm2,[src0 + 8]           ; R4    R0  r4  r0
-   movq mm3,[src4 + 8]           ; R6    R2  r6  r2
--  movq mm1,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm1,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm7,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm7,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm7,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm7,mm3               ; C6R6+C2R2 C6r6+C2r2
--  pmaddwd mm3,[coeffs+40]       ; -C2R6+C6R2    -C2r6+C6r2
-+  pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff]       ; -C2R6+C6R2    -C2r6+C6r2
-   ; rounder_op mm1, rounder_arg
-   paddd mm7,mm1                 ; A0        a0
-   paddd mm1,mm1                 ; 2C0       2c0
-@@ -829,17 +829,17 @@ coeffs:
-   movq mm0,[src0]               ; R4    R0  r4  r0
-   movq mm1,[src4]               ; R6    R2  r6  r2
-   movq mm2,[src1]               ; R3    R1  r3  r1
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
--  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
-+  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
-   pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
-   ; rounder_op mm4, rounder_arg
-   movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
-+  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
-   ; rounder_op mm0, rounder_arg
-   pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
-   paddd mm4,mm5                 ; A0        a0
-@@ -847,7 +847,7 @@ coeffs:
-   movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
-   paddd mm0,mm1                 ; A1        a1
-   psubd mm5,mm1                 ; A2        a2
--  movq mm1,[coeffs+64]
-+  movq mm1,[ebx + coeffs+64 wrt ..gotoff]
-   pmaddwd mm1,mm2               ; -C7R3+C3R1    -C7r3+C3r1
-   paddd mm7,mm4                 ; A0+B0     a0+b0
-   paddd mm4,mm4                 ; 2A0       2a0
-@@ -867,9 +867,9 @@ coeffs:
-   movd [dst + 96],mm3
-   packssdw mm4,mm4              ; A0-B0 a0-b0
-   movd [dst + 112],mm4
--  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
-+  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
-   pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
--  pmaddwd mm2,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
-+  pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
-   movq mm3,mm5                  ; A2        a2
-   paddd mm3,mm4                 ; A2+B2     a2+b2
-   psubd mm5,mm4                 ; a2-B2     a2-b2
-@@ -912,20 +912,20 @@ coeffs:
- %define	rounder_arg	%7
- %define	shift		%8
-   movq mm0,[src0]               ; R4    R0  r4  r0
--  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
--  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
-   ; rounder_op mm4, rounder_arg
-   ; rounder_op mm0, rounder_arg
-   psrad mm4,shift
-   psrad mm0,shift
-   movq mm2,[src0 + 8]           ; R4    R0  r4  r0
--  movq mm1,[coeffs+16]          ; C4    C4  C4  C4
-+  movq mm1,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
-   pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
--  movq mm7,[coeffs+24]          ; -C4   C4  -C4 C4
-+  movq mm7,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
-   pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
--  movq mm7,[coeffs+32]          ; C6    C2  C6  C2
-+  movq mm7,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
-   ; rounder_op mm1, rounder_arg
-   ; rounder_op mm2, rounder_arg
-   psrad mm1,shift
-@@ -1073,6 +1073,11 @@ coeffs:
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+  mov ebx, [esp]
-+  retn
-+
- cglobal simple_idct_mmx_P
- cglobal simple_idct_mmx
- 
-@@ -1083,14 +1088,18 @@ cglobal simple_idct_mmx
- 
- ALIGN 16
- simple_idct_mmx_P:
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-   sub esp, 128
--  mov edx, [esp+128+4]
-+  mov edx, [esp+128+4+4]
- 
- ;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
--  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [coeffs+8], 11
--  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [coeffs],   11,     .four
--  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .two
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .one
-+  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [ebx + coeffs+8 wrt ..gotoff], 11
-+  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .four
-+  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .two
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .one
-   IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1099,8 +1108,8 @@ simple_idct_mmx_P:
- 
- ALIGN 16
- .four
--  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .six
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .five
-+  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .six
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .five
-   IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1109,7 +1118,7 @@ ALIGN 16
- 
- ALIGN 16
- .six
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .seven
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .seven
-   IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1118,7 +1127,7 @@ ALIGN 16
- 
- ALIGN 16
- .two
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .three
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .three
-   IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1159,6 +1168,7 @@ ALIGN 16
- .ret
-   add esp, 128
- 
-+  pop ebx
-   ret
- .endfunc
- 
-@@ -1174,15 +1184,19 @@ ALIGN 16
- 
- ALIGN 16
- simple_idct_mmx:
-+  push ebx
-+  call get_pc.bx
-+  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-   sub esp, 128
--  mov edx, [esp+128+4]
-+  mov edx, [esp+128+4+4]
-   PERMUTEP edx			; permute parm list in place
- 
- ;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
--  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [coeffs+8], 11
--  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [coeffs],   11,     .fourP
--  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .twoP
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .oneP
-+  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [ebx + coeffs+8 wrt ..gotoff], 11
-+  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .fourP
-+  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .twoP
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .oneP
-   IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1191,8 +1205,8 @@ simple_idct_mmx:
- 
- ALIGN 16
- .fourP
--  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .sixP
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .fiveP
-+  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .sixP
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .fiveP
-   IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1201,7 +1215,7 @@ ALIGN 16
- 
- ALIGN 16
- .sixP
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .sevenP
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .sevenP
-   IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1210,7 +1224,7 @@ ALIGN 16
- 
- ALIGN 16
- .twoP
--  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .threeP
-+  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .threeP
-   IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
-   IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
-   IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
-@@ -1251,6 +1265,7 @@ ALIGN 16
- .retP
-   add esp, 128
- 
-+  pop ebx
-   ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_mmx.inc xvidcore-1.1.2/src/image/x86_asm/colorspace_mmx.inc
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_mmx.inc	2004-08-29 12:02:38.000000000 +0200
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_mmx.inc	2007-01-27 16:34:11.000000000 +0100
-@@ -56,11 +56,14 @@ NAME:
-   push edi    ;   esp + localsize + 4
-   push ebp    ;   esp + localsize + 0
- 
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %define x_dif           esp + localsize - 4
- %define y_dif           esp + localsize - 8
- %define uv_dif          esp + localsize - 12
- %define fixed_width     esp + localsize - 16
--%define tmp_height      esp + localsize - 20
-+%define tmp_fixed_width esp + localsize - 20
- 
-     sub esp, localsize
- 
-@@ -90,8 +93,6 @@ NAME:
-   mov esi, [y_ptr]          ; $esi$ = y_ptr
-   mov edi, [x_ptr]          ; $edi$ = x_ptr
-   mov edx, [x_stride]       ; $edx$ = x_stride
--  mov ebp, [height]         ; $ebp$ = height
--
- 
-   mov ebx, [vflip]
-   or ebx, ebx
-@@ -106,7 +107,7 @@ NAME:
-   sub ebx, edx
-   mov [x_dif], ebx          ; x_dif = -BYTES*fixed_width - x_stride
- 
--  mov eax, ebp
-+  mov eax, [height]
-   sub eax, 1
-   push edx                
-   mul edx
-@@ -126,8 +127,8 @@ NAME:
-   FUNC %+ _INIT ARG1, ARG2  ; call FUNC_INIT
- 
- .y_loop
--  mov [tmp_height], ebp
--  mov ebp, [fixed_width]
-+  push dword [fixed_width]
-+  pop dword [tmp_fixed_width]
- 
- .x_loop
-   FUNC ARG1, ARG2           ; call FUNC
-@@ -137,10 +138,9 @@ NAME:
-   add ebx, PIXELS/2         ; u_ptr += PIXELS/2
-   add ecx, PIXELS/2         ; v_ptr += PIXELS/2
-         
--  sub ebp, PIXELS           ; $ebp$ -= PIXELS
-+  sub dword [tmp_fixed_width], PIXELS           ; $ebp$ -= PIXELS
-   jg .x_loop                ; if ($ebp$ > 0) goto .x_loop
- 
--  mov ebp, [tmp_height]
-   add edi, [x_dif]          ; x_ptr += x_dif + (VPIXELS-1)*x_stride
-   add esi, [y_dif]          ; y_ptr += y_dif + (VPIXELS-1)*y_stride
- %rep VPIXELS-1
-@@ -155,7 +155,7 @@ NAME:
-   add ecx, [uv_stride]
- %endrep
- 
--  sub ebp, VPIXELS          ; $ebp$ -= VPIXELS
-+  sub dword [height], VPIXELS          ; $ebp$ -= VPIXELS
-   jg .y_loop                ; if ($ebp$ > 0) goto .y_loop
- 
-   ; cleanup stack & undef everything
-@@ -181,7 +181,6 @@ NAME:
- %undef y_dif
- %undef uv_dif
- %undef fixed_width
--%undef tmp_height
-         ret
- .endfunc
- %undef NAME
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_rgb_mmx.asm xvidcore-1.1.2/src/image/x86_asm/colorspace_rgb_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_rgb_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_rgb_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -120,7 +120,7 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- ;------------------------------------------------------------------------------
- 
- %macro BGR_TO_YV12_INIT		2
--  movq mm7, [y_mul]
-+  movq mm7, [ebp + y_mul wrt ..gotoff]
- %endmacro
- 
- 
-@@ -184,8 +184,8 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- 
-   ; u_ptr, v_ptr
-   movq mm0, mm6                 ; = [  |b4|g4|r4]
--  pmaddwd mm6, [v_mul]          ; *= V_MUL
--  pmaddwd mm0, [u_mul]          ; *= U_MUL
-+  pmaddwd mm6, [ebp + v_mul wrt ..gotoff]          ; *= V_MUL
-+  pmaddwd mm0, [ebp + u_mul wrt ..gotoff]          ; *= U_MUL
-   movq mm1, mm0
-   movq mm2, mm6
-   psrlq mm1, 32
-@@ -230,30 +230,30 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
-   movd mm3, [ecx]           ; v_ptr[0]
-   punpcklbw mm2, mm7        ; u3u2u1u0 -> mm2
-   punpcklbw mm3, mm7        ; v3v2v1v0 -> mm3
--  psubsw mm2, [U_SUB]       ; U - 128
--  psubsw mm3, [V_SUB]       ; V - 128
-+  psubsw mm2, [ebp + U_SUB wrt ..gotoff]       ; U - 128
-+  psubsw mm3, [ebp + V_SUB wrt ..gotoff]       ; V - 128
-   movq mm4, mm2
-   movq mm5, mm3
--  pmullw mm2, [UG_MUL]
--  pmullw mm3, [VG_MUL]
-+  pmullw mm2, [ebp + UG_MUL wrt ..gotoff]
-+  pmullw mm3, [ebp + VG_MUL wrt ..gotoff]
-   movq mm6, mm2             ; u3u2u1u0 -> mm6
-   punpckhwd mm2, mm2        ; u3u3u2u2 -> mm2
-   punpcklwd mm6, mm6        ; u1u1u0u0 -> mm6
--  pmullw mm4, [UB_MUL]      ; B_ADD -> mm4
-+  pmullw mm4, [ebp + UB_MUL wrt ..gotoff]      ; B_ADD -> mm4
-   movq mm0, mm3
-   punpckhwd mm3, mm3        ; v3v3v2v2 -> mm2
-   punpcklwd mm0, mm0        ; v1v1v0v0 -> mm6
-   paddsw mm2, mm3
-   paddsw mm6, mm0
--  pmullw mm5, [VR_MUL]      ; R_ADD -> mm5
-+  pmullw mm5, [ebp + VR_MUL wrt ..gotoff]      ; R_ADD -> mm5
-   movq mm0, [esi]           ; y7y6y5y4y3y2y1y0 -> mm0
-   movq mm1, mm0
-   punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
-   punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
--  psubsw mm0, [Y_SUB]       ; Y - Y_SUB
--  psubsw mm1, [Y_SUB]       ; Y - Y_SUB
--  pmullw mm1, [Y_MUL]
--  pmullw mm0, [Y_MUL]
-+  psubsw mm0, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
-+  psubsw mm1, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
-+  pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
-+  pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
-   movq [TEMP_Y2], mm1       ; y7y6y5y4 -> mm3
-   movq [TEMP_Y1], mm0       ; y3y2y1y0 -> mm7
-   psubsw mm1, mm2           ; g7g6g5g4 -> mm1
-@@ -266,10 +266,10 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
-   movq mm1, mm0
-   punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
-   punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
--  psubsw mm0, [Y_SUB]       ; Y - Y_SUB
--  psubsw mm1, [Y_SUB]       ; Y - Y_SUB
--  pmullw mm1, [Y_MUL]
--  pmullw mm0, [Y_MUL]
-+  psubsw mm0, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
-+  psubsw mm1, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
-+  pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
-+  pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
-   movq mm3, mm1
-   psubsw mm1, mm2           ; g7g6g5g4 -> mm1
-   movq mm2, mm0
-@@ -419,6 +419,11 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+  mov ebp, [esp]
-+  retn
-+
- %include "colorspace_mmx.inc"
- 
- ; input
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm xvidcore-1.1.2/src/image/x86_asm/colorspace_yuyv_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_yuyv_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -76,7 +76,7 @@ mmx_one:    dw 1, 1, 1, 1
- ;-----------------------------------------------------------------------------
- 
- %macro YUYV_TO_YV12_INIT		2
--  movq mm7, [yuyv_mask]
-+  movq mm7, [ebp + yuyv_mask wrt ..gotoff]
- %endmacro
- 
- 
-@@ -108,8 +108,8 @@ mmx_one:    dw 1, 1, 1, 1
-   pand mm5, mm7
-   pand mm6, mm7
-   paddw mm5, mm6
--  paddw mm4, [mmx_one]      ; +1 rounding
--  paddw mm5, [mmx_one]      ;
-+  paddw mm4, [ebp + mmx_one wrt ..gotoff]      ; +1 rounding
-+  paddw mm5, [ebp + mmx_one wrt ..gotoff]      ;
-   psrlw mm4, 1
-   psrlw mm5, 1
- ;---[ 3dnow/xmm ]----------------------------------------------------
-@@ -310,6 +310,11 @@ mmx_one:    dw 1, 1, 1, 1
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+  mov ebp, [esp]
-+  retn
-+
- %include "colorspace_mmx.inc"
- 
- ; input
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dn.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dn.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dn.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dn.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -44,20 +44,6 @@ BITS 32
- %endmacro
- 
- ;=============================================================================
--; Read Only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
--	times 8 db 1
--
--;=============================================================================
- ; Code
- ;=============================================================================
- 
-@@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_3dn:
- 
- .rounding1
-   ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_3DN_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_3DN_RND1
-@@ -206,7 +195,10 @@ interpolate8x8_halfpel_v_3dn:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   movq mm2, [eax]       ; loop invariant
-   add eax, edx
- 
-@@ -329,7 +321,10 @@ interpolate8x8_halfpel_hv_3dn
-   mov eax, [esp+ 8] ; Src
-   mov edx, [esp+12] ; stride
- 
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
-   movq mm2, [eax]
-@@ -387,7 +382,10 @@ interpolate8x4_halfpel_h_3dn:
- 
- .rounding1
-   ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_3DN_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_3DN_RND1
-@@ -424,7 +422,10 @@ interpolate8x4_halfpel_v_3dn:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   movq mm2, [eax]       ; loop invariant
-   add eax, edx
- 
-@@ -462,7 +463,10 @@ interpolate8x4_halfpel_hv_3dn
-   mov eax, [esp+ 8] ; Src
-   mov edx, [esp+12] ; stride
- 
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
-   movq mm2, [eax]
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dne.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dne.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dne.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dne.asm	2007-01-27 17:25:51.000000000 +0100
-@@ -45,24 +45,6 @@ BITS 32
- %endmacro
- 
- ;=============================================================================
--; Read only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
--	times 8 db 1
--
--ALIGN 8
--mm_minusone:
--	dd -1,-1
--
--;=============================================================================
- ; Macros
- ;=============================================================================
- 
-@@ -149,7 +131,10 @@ interpolate8x8_halfpel_h_3dne:
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-   mov ecx, [esp+ 4] ; Dst
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_SSE_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_SSE_RND1
-@@ -223,15 +208,15 @@ ALIGN 8
-   psubusb mm0, [eax]
-   add eax, edx
-   mov ecx, [esp+ 4] ; Dst
--  push esi
-+  push byte -1
-+  push byte -1
-   pcmpeqb mm1, mm1
-   pcmpeqb mm2, mm2
--  mov esi, mm_minusone
-   psubusb mm1, [byte eax]
-   psubusb mm2, [eax+edx]
-   lea eax, [eax+2*edx]
--  movq mm6, [esi]
--  movq mm7, [esi]
-+  movq mm6, [esp]
-+  movq mm7, [esp]
-   pavgb mm0, mm1
-   pavgb mm1, mm2
-   psubusb mm6, mm0
-@@ -246,8 +231,8 @@ ALIGN 8
-   lea eax, [eax+2*edx]
-   pavgb mm2, mm3
-   pavgb mm3, mm4
--  movq mm0, [esi]
--  movq mm1, [esi]
-+  movq mm0, [esp]
-+  movq mm1, [esp]
-   psubusb mm0, mm2
-   psubusb mm1, mm3
-   movq [ecx], mm0
-@@ -261,8 +246,8 @@ ALIGN 8
-   lea eax, [eax+2*edx]
-   pavgb mm4, mm5
-   pavgb mm5, mm6
--  movq mm2, [esi]
--  movq mm3, [esi]
-+  movq mm2, [esp]
-+  movq mm3, [esp]
-   psubusb mm2, mm4
-   psubusb mm3, mm5
-   movq [ecx], mm2
-@@ -274,10 +259,10 @@ ALIGN 8
-   psubusb mm0, [eax+edx]
-   pavgb mm6, mm7
-   pavgb mm7, mm0
--  movq mm4, [esi]
--  movq mm5, [esi]
-+  movq mm4, [esp]
-+  movq mm5, [esp]
-   psubusb mm4, mm6
--  pop esi
-+  add esp, byte 8
-   psubusb mm5, mm7
-   movq [ecx], mm4
-   movq [ecx+edx], mm5
-@@ -391,7 +376,10 @@ interpolate8x8_halfpel_hv_3dne:
-   pavgb mm2, mm3
-   pxor mm3, mm6         ; mm2/mm3 ready
-   mov ecx, [esp+ 4]     ; Dst
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-   jz near .rounding1
-   lea ebp,[byte ebp]
-@@ -443,7 +431,10 @@ interpolate8x4_halfpel_h_3dne:
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-   mov ecx, [esp+ 4] ; Dst
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_SSE_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_SSE_RND1
-@@ -501,16 +492,15 @@ ALIGN 8
-   add eax, edx                  ; eax==line1
-   mov ecx, [esp+ 4] ; Dst
- 
--  push esi
--
-   pcmpeqb mm1, mm1
-   pcmpeqb mm2, mm2
--  mov esi, mm_minusone
-+  push byte -1
-+  push byte -1
-   psubusb mm1, [byte eax]       ; line1
-   psubusb mm2, [eax+edx]        ; line2
-   lea eax, [eax+2*edx]          ; eax==line3
--  movq mm6, [esi]
--  movq mm7, [esi]
-+  movq mm6, [esp]
-+  movq mm7, [esp]
-   pavgb mm0, mm1
-   pavgb mm1, mm2
-   psubusb mm6, mm0
-@@ -526,15 +516,13 @@ ALIGN 8
-   lea eax, [eax+2*edx]          ; eax==line 5
-   pavgb mm2, mm3
-   pavgb mm3, mm4
--  movq mm0, [esi]
--  movq mm1, [esi]
-+  movq mm0, [esp]
-+  movq mm1, [esp]
-   psubusb mm0, mm2
-   psubusb mm1, mm3
-   movq [ecx], mm0
-   movq [ecx+edx], mm1
- 
--  pop esi
--
-   ret
- 
- .endfunc
-@@ -562,7 +550,10 @@ interpolate8x4_halfpel_hv_3dne:
-   pavgb mm2, mm3
-   pxor mm3, mm6         ; mm2/mm3 ready
-   mov ecx, [esp+ 4]     ; Dst
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  lea esp, [esp + 8]
- 
-   jz near .rounding1
-   lea ebp,[byte ebp]
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_mmx.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -166,13 +166,17 @@ interpolate8x8_halfpel_h_mmx:
- 
-   push esi
-   push edi
--  mov eax, [esp + 8 + 16]       ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
- 
--  movq mm7, [rounding1_mmx + eax * 8]
-+  mov eax, [esp + 12 + 16]       ; rounding
- 
--  mov edi, [esp + 8 + 4]        ; dst
--  mov esi, [esp + 8 + 8]        ; src
--  mov edx, [esp + 8 + 12]       ; stride
-+  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-+
-+  mov edi, [esp + 12 + 4]        ; dst
-+  mov esi, [esp + 12 + 8]        ; src
-+  mov edx, [esp + 12 + 12]       ; stride
- 
-   pxor mm6, mm6                 ; zero
- 
-@@ -185,6 +189,7 @@ interpolate8x8_halfpel_h_mmx:
-   COPY_H_MMX
-   COPY_H_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -225,13 +230,17 @@ interpolate8x8_halfpel_v_mmx:
-   push esi
-   push edi
- 
--  mov eax, [esp + 8 + 16]       ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 12 + 16]       ; rounding
- 
--  movq mm7, [rounding1_mmx + eax * 8]
-+  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
- 
--  mov edi, [esp + 8 + 4]        ; dst
--  mov esi, [esp + 8 + 8]        ; src
--  mov edx, [esp + 8 + 12]       ; stride
-+  mov edi, [esp + 12 + 4]        ; dst
-+  mov esi, [esp + 12 + 8]        ; src
-+  mov edx, [esp + 12 + 12]       ; stride
- 
-   pxor mm6, mm6                 ; zero
- 
-@@ -245,6 +254,7 @@ interpolate8x8_halfpel_v_mmx:
-   COPY_V_MMX
-   COPY_V_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -315,18 +325,22 @@ interpolate8x8_halfpel_hv_mmx:
-   push esi
-   push edi
- 
--  mov eax, [esp + 8 + 16]   ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
- 
--  movq mm7, [rounding2_mmx + eax * 8]
-+  mov eax, [esp + 12 + 16]   ; rounding
- 
--  mov edi, [esp + 8 + 4]    ; dst
--  mov esi, [esp + 8 + 8]    ; src
-+  movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
-+
-+  mov edi, [esp + 12 + 4]    ; dst
-+  mov esi, [esp + 12 + 8]    ; src
- 
-   mov eax, 8
- 
-   pxor mm6, mm6             ; zero
- 
--  mov edx, [esp + 8 + 12]   ; stride
-+  mov edx, [esp + 12 + 12]   ; stride
- 
-   COPY_HV_MMX
-   COPY_HV_MMX
-@@ -337,6 +351,7 @@ interpolate8x8_halfpel_hv_mmx:
-   COPY_HV_MMX
-   COPY_HV_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -357,13 +372,18 @@ interpolate8x4_halfpel_h_mmx:
- 
-   push esi
-   push edi
--  mov eax, [esp + 8 + 16]       ; rounding
- 
--  movq mm7, [rounding1_mmx + eax * 8]
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 12 + 16]       ; rounding
- 
--  mov edi, [esp + 8 + 4]        ; dst
--  mov esi, [esp + 8 + 8]        ; src
--  mov edx, [esp + 8 + 12]       ; stride
-+  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-+
-+  mov edi, [esp + 12 + 4]        ; dst
-+  mov esi, [esp + 12 + 8]        ; src
-+  mov edx, [esp + 12 + 12]       ; stride
- 
-   pxor mm6, mm6                 ; zero
- 
-@@ -372,6 +392,7 @@ interpolate8x4_halfpel_h_mmx:
-   COPY_H_MMX
-   COPY_H_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -394,13 +415,17 @@ interpolate8x4_halfpel_v_mmx:
-   push esi
-   push edi
- 
--  mov eax, [esp + 8 + 16]       ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 12 + 16]       ; rounding
- 
--  movq mm7, [rounding1_mmx + eax * 8]
-+  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
- 
--  mov edi, [esp + 8 + 4]        ; dst
--  mov esi, [esp + 8 + 8]        ; src
--  mov edx, [esp + 8 + 12]       ; stride
-+  mov edi, [esp + 12 + 4]        ; dst
-+  mov esi, [esp + 12 + 8]        ; src
-+  mov edx, [esp + 12 + 12]       ; stride
- 
-   pxor mm6, mm6                 ; zero
- 
-@@ -410,6 +435,7 @@ interpolate8x4_halfpel_v_mmx:
-   COPY_V_MMX
-   COPY_V_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -433,24 +459,29 @@ interpolate8x4_halfpel_hv_mmx:
-   push esi
-   push edi
- 
--  mov eax, [esp + 8 + 16]   ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
- 
--  movq mm7, [rounding2_mmx + eax * 8]
-+  mov eax, [esp + 12 + 16]   ; rounding
- 
--  mov edi, [esp + 8 + 4]    ; dst
--  mov esi, [esp + 8 + 8]    ; src
-+  movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
-+
-+  mov edi, [esp + 12 + 4]    ; dst
-+  mov esi, [esp + 12 + 8]    ; src
- 
-   mov eax, 8
- 
-   pxor mm6, mm6             ; zero
- 
--  mov edx, [esp + 8 + 12]   ; stride
-+  mov edx, [esp + 12 + 12]   ; stride
- 
-   COPY_HV_MMX
-   COPY_HV_MMX
-   COPY_HV_MMX
-   COPY_HV_MMX
- 
-+  pop ebp
-   pop edi
-   pop esi
- 
-@@ -491,10 +522,10 @@ interpolate8x4_halfpel_hv_mmx:
- 
-   por mm3, mm6
- 
--  pand mm0, [mmx_mask]
--  pand mm1, [mmx_mask]
--  pand mm4, [mmx_mask]
--  pand mm5, [mmx_mask]
-+  pand mm0, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm1, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm4, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm5, [ebp + mmx_mask wrt ..gotoff]
- 
-   psrlq mm0, 1              ; src1 / 2
-   psrlq mm1, 1              ; src2 / 2
-@@ -538,10 +569,10 @@ interpolate8x4_halfpel_hv_mmx:
- 
-   pand mm3, mm6
- 
--  pand mm0, [mmx_mask]
--  pand mm1, [mmx_mask]
--  pand mm4, [mmx_mask]
--  pand mm5, [mmx_mask]
-+  pand mm0, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm1, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm4, [ebp + mmx_mask wrt ..gotoff]
-+  pand mm5, [ebp + mmx_mask wrt ..gotoff]
- 
-   psrlq mm0, 1              ; src1 / 2
-   psrlq mm1, 1              ; src2 / 2
-@@ -567,21 +598,25 @@ interpolate8x8_avg2_mmx:
- 
-   push ebx
- 
--  mov eax, [esp + 4 + 20]   ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 8 + 20]   ; rounding
-   test eax, eax
- 
-   jnz near .rounding1
- 
--  mov eax, [esp + 4 + 24]   ; height -> eax
-+  mov eax, [esp + 8 + 24]   ; height -> eax
-   sub eax, 8
-   test eax, eax
- 
--  mov ecx, [esp + 4 + 4]    ; dst -> edi
--  mov eax, [esp + 4 + 8]    ; src1 -> esi
--  mov ebx, [esp + 4 + 12]   ; src2 -> eax
--  mov edx, [esp + 4 + 16]   ; stride -> edx
-+  mov ecx, [esp + 8 + 4]    ; dst -> edi
-+  mov eax, [esp + 8 + 8]    ; src1 -> esi
-+  mov ebx, [esp + 8 + 12]   ; src2 -> eax
-+  mov edx, [esp + 8 + 16]   ; stride -> edx
- 
--  movq mm7, [mmx_one]
-+  movq mm7, [ebp + mmx_one wrt ..gotoff]
- 
-   jz near .start0
- 
-@@ -602,16 +637,16 @@ interpolate8x8_avg2_mmx:
-   ret
- 
- .rounding1
--  mov eax, [esp + 4 + 24]       ; height -> eax
-+  mov eax, [esp + 8 + 24]       ; height -> eax
-   sub eax, 8
-   test eax, eax
- 
--  mov ecx, [esp + 4 + 4]        ; dst -> edi
--  mov eax, [esp + 4 + 8]        ; src1 -> esi
--  mov ebx, [esp + 4 + 12]       ; src2 -> eax
--  mov edx, [esp + 4 + 16]       ; stride -> edx
-+  mov ecx, [esp + 8 + 4]        ; dst -> edi
-+  mov eax, [esp + 8 + 8]        ; src1 -> esi
-+  mov ebx, [esp + 8 + 12]       ; src2 -> eax
-+  mov edx, [esp + 8 + 16]       ; stride -> edx
- 
--  movq mm7, [mmx_one]
-+  movq mm7, [ebp + mmx_one wrt ..gotoff]
- 
-   jz near .start1
- 
-@@ -628,6 +663,7 @@ interpolate8x8_avg2_mmx:
-   lea ecx, [ecx+2*edx]
-   AVG2_MMX_RND1
- 
-+  pop ebp
-   pop ebx
-   ret
- .endfunc
-@@ -652,11 +688,11 @@ interpolate8x8_avg2_mmx:
-   movq mm2, mm0
-   movq mm3, mm1
- 
--  pand mm2, [mmx_three]
--  pand mm3, [mmx_three]
-+  pand mm2, [ebp + mmx_three wrt ..gotoff]
-+  pand mm3, [ebp + mmx_three wrt ..gotoff]
- 
--  pand mm0, [mmx_mask2]
--  pand mm1, [mmx_mask2]
-+  pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
-+  pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm0, 2
-   psrlq mm1, 2
-@@ -673,11 +709,11 @@ interpolate8x8_avg2_mmx:
-   movq mm1, mm4
-   movq mm3, mm5
- 
--  pand mm1, [mmx_three]
--  pand mm3, [mmx_three]
-+  pand mm1, [ebp + mmx_three wrt ..gotoff]
-+  pand mm3, [ebp + mmx_three wrt ..gotoff]
- 
--  pand mm4, [mmx_mask2]
--  pand mm5, [mmx_mask2]
-+  pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
-+  pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm4, 2
-   psrlq mm5, 2
-@@ -688,8 +724,8 @@ interpolate8x8_avg2_mmx:
-   paddb mm1, mm3
-   paddb mm2, mm1
- 
--  paddb mm2, [mmx_two]
--  pand mm2, [mmx_mask2]
-+  paddb mm2, [ebp + mmx_two wrt ..gotoff]
-+  pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm2, 2
-   paddb mm0, mm2
-@@ -707,11 +743,11 @@ interpolate8x8_avg2_mmx:
-   movq mm2, mm0
-   movq mm3, mm1
- 
--  pand mm2, [mmx_three]
--  pand mm3, [mmx_three]
-+  pand mm2, [ebp + mmx_three wrt ..gotoff]
-+  pand mm3, [ebp + mmx_three wrt ..gotoff]
- 
--  pand mm0, [mmx_mask2]
--  pand mm1, [mmx_mask2]
-+  pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
-+  pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm0, 2
-   psrlq mm1, 2
-@@ -728,11 +764,11 @@ interpolate8x8_avg2_mmx:
-   movq mm1, mm4
-   movq mm3, mm5
- 
--  pand mm1, [mmx_three]
--  pand mm3, [mmx_three]
-+  pand mm1, [ebp + mmx_three wrt ..gotoff]
-+  pand mm3, [ebp + mmx_three wrt ..gotoff]
- 
--  pand mm4, [mmx_mask2]
--  pand mm5, [mmx_mask2]
-+  pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
-+  pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm4, 2
-   psrlq mm5, 2
-@@ -743,8 +779,8 @@ interpolate8x8_avg2_mmx:
-   paddb mm1, mm3
-   paddb mm2, mm1
- 
--  paddb mm2, [mmx_one]
--  pand mm2, [mmx_mask2]
-+  paddb mm2, [ebp + mmx_one wrt ..gotoff]
-+  pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
- 
-   psrlq mm2, 2
-   paddb mm0, mm2
-@@ -762,18 +798,22 @@ interpolate8x8_avg4_mmx:
-   push edi
-   push esi
- 
--  mov eax, [esp + 12 + 28]      ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 16 + 28]      ; rounding
- 
-   test eax, eax
- 
--  mov ecx, [esp + 12 + 4]       ; dst -> edi
--  mov eax, [esp + 12 + 8]       ; src1 -> esi
--  mov ebx, [esp + 12 + 12]      ; src2 -> eax
--  mov esi, [esp + 12 + 16]      ; src3 -> esi
--  mov edi, [esp + 12 + 20]      ; src4 -> edi
--  mov edx, [esp + 12 + 24]      ; stride -> edx
-+  mov ecx, [esp + 16 + 4]       ; dst -> edi
-+  mov eax, [esp + 16 + 8]       ; src1 -> esi
-+  mov ebx, [esp + 16 + 12]      ; src2 -> eax
-+  mov esi, [esp + 16 + 16]      ; src3 -> esi
-+  mov edi, [esp + 16 + 20]      ; src4 -> edi
-+  mov edx, [esp + 16 + 24]      ; stride -> edx
- 
--  movq mm7, [mmx_one]
-+  movq mm7, [ebp + mmx_one wrt ..gotoff]
- 
-   jnz near .rounding1
- 
-@@ -815,6 +855,7 @@ interpolate8x8_avg4_mmx:
-   lea ecx, [ecx+edx]
-   AVG4_MMX_RND1
- 
-+  pop ebp
-   pop esi
-   pop edi
-   pop ebx
-@@ -868,8 +909,8 @@ interpolate8x8_avg4_mmx:
-   psubsw mm0, mm2
-   psubsw mm1, mm3
- 
--  pmullw mm0, [mmx_five]
--  pmullw mm1, [mmx_five]
-+  pmullw mm0, [ebp + mmx_five wrt ..gotoff]
-+  pmullw mm1, [ebp + mmx_five wrt ..gotoff]
- 
-   movq mm2, [eax-2]
-   movq mm4, [eax+3]
-@@ -903,13 +944,17 @@ interpolate8x8_avg4_mmx:
- ALIGN 16
- interpolate8x8_6tap_lowpass_h_mmx:
- 
--  mov eax, [esp + 16]           ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+  mov eax, [esp + 20]           ; rounding
- 
--  movq mm6, [rounding_lowpass_mmx + eax * 8]
-+  movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
- 
--  mov ecx, [esp + 4]            ; dst -> edi
--  mov eax, [esp + 8]            ; src -> esi
--  mov edx, [esp + 12]           ; stride -> edx
-+  mov ecx, [esp + 8]            ; dst -> edi
-+  mov eax, [esp + 12]            ; src -> esi
-+  mov edx, [esp + 16]           ; stride -> edx
- 
-   pxor mm7, mm7
- 
-@@ -929,6 +974,7 @@ interpolate8x8_6tap_lowpass_h_mmx:
-   lea ecx, [ecx+edx]
-   LOWPASS_6TAP_H_MMX
- 
-+  pop ebp
-   ret
- .endfunc
- 
-@@ -979,8 +1025,8 @@ interpolate8x8_6tap_lowpass_h_mmx:
-   psubsw mm0, mm2
-   psubsw mm1, mm3
- 
--  pmullw mm0, [mmx_five]
--  pmullw mm1, [mmx_five]
-+  pmullw mm0, [ebp + mmx_five wrt ..gotoff]
-+  pmullw mm1, [ebp + mmx_five wrt ..gotoff]
- 
-   movq mm2, [eax+edx]
-   movq mm4, [eax+2*ebx]
-@@ -1016,13 +1062,17 @@ interpolate8x8_6tap_lowpass_v_mmx:
- 
-   push ebx
- 
--  mov eax, [esp + 4 + 16]           ; rounding
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
- 
--  movq mm6, [rounding_lowpass_mmx + eax * 8]
-+  mov eax, [esp + 8 + 16]           ; rounding
- 
--  mov ecx, [esp + 4 + 4]            ; dst -> edi
--  mov eax, [esp + 4 + 8]            ; src -> esi
--  mov edx, [esp + 4 + 12]           ; stride -> edx
-+  movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
-+
-+  mov ecx, [esp + 8 + 4]            ; dst -> edi
-+  mov eax, [esp + 8 + 8]            ; src -> esi
-+  mov edx, [esp + 8 + 12]           ; stride -> edx
- 
-   mov ebx, edx
-   shl ebx, 1
-@@ -1046,6 +1096,7 @@ interpolate8x8_6tap_lowpass_v_mmx:
-   lea ecx, [ecx+edx]
-   LOWPASS_6TAP_V_MMX
- 
-+  pop ebp
-   pop ebx
-   ret
- .endfunc
-@@ -1066,12 +1117,17 @@ interpolate8x8_6tap_lowpass_v_mmx:
- 
- %macro PROLOG 2   ; %1: Rounder, %2 load Dst-Rounder
-   pxor mm6, mm6
--  movq mm7, [%1]    ; TODO: dangerous! (eax isn't checked)
-+  PROLOG0
-+
-+  push ebp
-+  call get_pc.bp
-+  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %if %2
--  movq mm5, [rounding1_mmx]
-+  movq mm5, [ebp + rounding1_mmx wrt ..gotoff]
- %endif
- 
--  PROLOG0
-+  movq mm7, [ebp + %1 wrt ..gotoff]    ; TODO: dangerous! (eax isn't checked)
- %endmacro
- 
-   ; performs: mm0 == (mm0+mm2)  mm1 == (mm1+mm3)
-@@ -1160,6 +1216,7 @@ interpolate8x8_halfpel_add_mmx:
-   ADD_FF_MMX 1
-   ADD_FF_MMX 1
-   ADD_FF_MMX 0
-+  pop ebp
-   ret
- .endfunc
- 
-@@ -1206,6 +1263,7 @@ interpolate8x8_halfpel_h_add_mmx:
-   ADD_FH_MMX
-   lea ecx,[ecx+edx]
-   ADD_FH_MMX
-+  pop ebp
-   ret
- .endfunc
- 
-@@ -1253,6 +1311,7 @@ interpolate8x8_halfpel_v_add_mmx:
-   ADD_HF_MMX
-   lea ecx,[ecx+edx]
-   ADD_HF_MMX
-+  pop ebp
-   ret
- .endfunc
- 
-@@ -1318,8 +1377,8 @@ interpolate8x8_halfpel_v_add_mmx:
-   paddusw mm0, mm4  ; mix Src(mm0/mm1) with Dst(mm2/mm3)
-   paddusw mm1, mm5
- 
--  paddusw mm0, [rounding1_mmx]
--  paddusw mm1, [rounding1_mmx]
-+  paddusw mm0, [ebp + rounding1_mmx wrt ..gotoff]
-+  paddusw mm1, [ebp + rounding1_mmx wrt ..gotoff]
- 
-   psrlw mm0, 1
-   psrlw mm1, 1
-@@ -1329,6 +1388,11 @@ interpolate8x8_halfpel_v_add_mmx:
-   movq [ecx], mm0
- %endmacro
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+  mov ebp, [esp]
-+  retn
-+
- ALIGN 16
- interpolate8x8_halfpel_hv_add_mmx:
-   PROLOG rounding2_mmx, 0    ; mm5 is busy. Don't load dst-rounder
-@@ -1364,6 +1428,7 @@ interpolate8x8_halfpel_hv_add_mmx:
-   lea ecx,[ecx+edx]
-   ADD_HH_MMX
- 
-+  pop ebp
-   ret
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_xmm.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_xmm.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_xmm.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_xmm.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -42,20 +42,6 @@ BITS 32
- 	%endif
- %endmacro
- 
--;=============================================================================
--; Read only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
--	times 8 db 1
--
- SECTION .text
- 
- cglobal interpolate8x8_halfpel_h_xmm
-@@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_xmm:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_SSE_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_SSE_RND1
-@@ -204,7 +193,10 @@ interpolate8x8_halfpel_v_xmm:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   movq mm2, [eax]       ; loop invariant
-   add eax, edx
- 
-@@ -326,7 +318,10 @@ interpolate8x8_halfpel_hv_xmm:
-   mov eax, [esp+ 8]  ; Src
-   mov edx, [esp+12]  ; stride
- 
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
-   movq mm2, [eax]
-@@ -384,7 +379,10 @@ interpolate8x4_halfpel_h_xmm:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   COPY_H_SSE_RND1
-   lea ecx, [ecx+2*edx]
-   COPY_H_SSE_RND1
-@@ -419,7 +417,10 @@ interpolate8x4_halfpel_v_xmm:
- 
- .rounding1
-  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
-   movq mm2, [eax]       ; loop invariant
-   add eax, edx
- 
-@@ -458,7 +459,10 @@ interpolate8x4_halfpel_hv_xmm:
-   mov eax, [esp+ 8]  ; Src
-   mov edx, [esp+12]  ; stride
- 
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
-   movq mm2, [eax]
-@@ -583,8 +587,8 @@ interpolate8x8_halfpel_add_xmm:  ; 23c
-     pxor mm2, mm4
-     pavgb mm1, mm3
-     pxor mm3, mm5
--    pand mm2, [mmx_one]
--    pand mm3, [mmx_one]
-+    pand mm2, [esp]
-+    pand mm3, [esp]
-     psubb mm0, mm2
-     psubb mm1, mm3
-     pavgb mm0, [ecx+%1]
-@@ -612,6 +616,8 @@ interpolate8x8_halfpel_h_add_xmm:   ; 32
- .Loop1
-   ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-   ; movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-   ADD_FH_RND1 0, edx
-   lea eax,[eax+2*edx]
-   lea ecx,[ecx+2*edx]
-@@ -622,6 +628,7 @@ interpolate8x8_halfpel_h_add_xmm:   ; 32
-   lea eax,[eax+2*edx]
-   lea ecx,[ecx+2*edx]
-   ADD_FH_RND1 0, edx
-+  add esp, byte 8
-   EPILOG
- .endfunc
- 
-@@ -686,7 +693,10 @@ interpolate8x8_halfpel_v_add_xmm:
- 
- .Loop1
-   movq mm0, [eax] ; loop invariant
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
-+  add esp, byte 8
- 
-   ADD_8_HF_RND1 
-   movq mm0, mm2
-@@ -809,7 +819,9 @@ ALIGN 16
- interpolate8x8_halfpel_hv_add_xmm:
-   PROLOG1
- 
--  movq mm7, [mmx_one]
-+  push dword 0x01010101
-+  push dword 0x01010101
-+  movq mm7, [esp]
- 
-     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
-   movq mm2, [eax] 
-@@ -838,6 +850,7 @@ interpolate8x8_halfpel_hv_add_xmm:
-   add ecx, edx
-   ADD_HH_RND1
- 
-+  add esp, byte 8
-   EPILOG
- .endfunc
- 
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_mmx.asm xvidcore-1.1.2/src/image/x86_asm/postprocessing_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/postprocessing_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -70,6 +70,11 @@ mmx_offset:
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+  mov ebp, [esp]
-+  retn
-+
- cglobal image_brightness_mmx
- 
- 
-@@ -83,16 +88,19 @@ image_brightness_mmx:
- 	push esi
- 	push edi
- 
--	movq mm6, [mmx_0x80]
--
- 	mov eax, [esp+8+20] ; offset
--	movq mm7, [mmx_offset + (eax + 128)*8]   ; being lazy
--
- 	mov edx, [esp+8+4]  ; Dst
- 	mov ecx, [esp+8+8]  ; stride
- 	mov esi, [esp+8+12] ; width
- 	mov edi, [esp+8+16] ; height
- 
-+	push ebp
-+	call get_pc.bp
-+	add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+	movq mm6, [ebp + mmx_0x80 wrt ..gotoff]
-+	movq mm7, [ebp + (eax + 128)*8 + mmx_offset wrt ..gotoff]   ; being lazy
-+	pop ebp
-+
- .yloop
- 	xor	eax, eax
- 
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_sse2.asm xvidcore-1.1.2/src/image/x86_asm/postprocessing_sse2.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_sse2.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/postprocessing_sse2.asm	2007-01-27 19:50:10.000000000 +0100
-@@ -42,19 +42,6 @@ BITS 32
- 	%endif
- %endmacro
- 
--;===========================================================================
--; read only data
--;===========================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--xmm_0x80:
--	times 16 db 0x80
--
- ;=============================================================================
- ; Code
- ;=============================================================================
-@@ -69,21 +56,9 @@ cglobal image_brightness_sse2
- 
- %macro CREATE_OFFSET_VECTOR 2
-   mov [%1 +  0], %2
--  mov [%1 +  1], %2
--  mov [%1 +  2], %2
--  mov [%1 +  3], %2
-   mov [%1 +  4], %2
--  mov [%1 +  5], %2
--  mov [%1 +  6], %2
--  mov [%1 +  7], %2
-   mov [%1 +  8], %2
--  mov [%1 +  9], %2
--  mov [%1 + 10], %2
--  mov [%1 + 11], %2
-   mov [%1 + 12], %2
--  mov [%1 + 13], %2
--  mov [%1 + 14], %2
--  mov [%1 + 15], %2
- %endmacro
- 
- ALIGN 16
-@@ -93,15 +68,17 @@ image_brightness_sse2:
-   push edi    ; 8 bytes offset for push
-   sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
- 
--  movdqa xmm6, [xmm_0x80]
--
-   ; Create a offset...offset vector
--  mov eax, [esp+8+32+20] ; brightness offset value	
--  mov edx, esp           ; edx will be esp aligned mod 16
--  add edx, 15            ; edx = esp + 15
--  and edx, ~15           ; edx = (esp + 15)&(~15)
--  CREATE_OFFSET_VECTOR edx, al
--  movdqa xmm7, [edx]
-+  movzx eax, byte [esp+8+32+20] ; brightness offset value
-+  mov ecx, esp           ; ecx will be esp aligned mod 16
-+  mov edx, 0x01010101
-+  add ecx, 15            ; ecx = esp + 15
-+  mul edx
-+  and ecx, ~15           ; ecx = (esp + 15)&(~15)
-+  CREATE_OFFSET_VECTOR ecx, dword 0x80808080
-+  movdqa xmm6, [ecx]
-+  CREATE_OFFSET_VECTOR ecx, eax
-+  movdqa xmm7, [ecx]
- 
-   mov edx, [esp+8+32+4]  ; Dst
-   mov ecx, [esp+8+32+8]  ; stride
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/qpel_mmx.asm xvidcore-1.1.2/src/image/x86_asm/qpel_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/qpel_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/qpel_mmx.asm	2007-01-27 17:51:30.000000000 +0100
-@@ -201,6 +201,11 @@ FIR_C23: times 4 dw 23
- 
- SECTION .text
- 
-+extern  _GLOBAL_OFFSET_TABLE_
-+get_pc.cx:
-+  mov ecx, [esp]
-+  retn
-+
- ;//////////////////////////////////////////////////////////////////////
- ;// Here we go with the Q-Pel mess.
- ;//  For horizontal passes, we process 4 *output* pixel in parallel
-@@ -208,22 +213,25 @@ SECTION .text
- ;//////////////////////////////////////////////////////////////////////
- 
- %macro PROLOG_NO_AVRG 0
-+  push ebx
-   push esi
-   push edi
-   push ebp
--  mov edi, [esp+16 + 0*4] ; Dst
--  mov esi, [esp+16 + 1*4] ; Src
--  mov ecx, [esp+16 + 2*4] ; Size
--  mov ebp, [esp+16 + 3*4] ; BpS
--  mov eax, [esp+16 + 4*4] ; Rnd
-+  mov edi, [esp+20 + 0*4] ; Dst
-+  mov esi, [esp+20 + 1*4] ; Src
-+  mov ebp, [esp+20 + 3*4] ; BpS
-+  mov eax, [esp+20 + 4*4] ; Rnd
-   and eax, 1
--  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
-+  call get_pc.cx
-+  add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+  movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff]  ; rounder
- %endmacro
- 
- %macro EPILOG_NO_AVRG 0
-   pop ebp
-   pop edi
-   pop esi
-+  pop ebx
-   ret
- %endmacro
- 
-@@ -234,12 +242,13 @@ SECTION .text
-   push ebp
-   mov edi, [esp+20 + 0*4] ; Dst
-   mov esi, [esp+20 + 1*4] ; Src
--  mov ecx, [esp+20 + 2*4] ; Size
-   mov ebp, [esp+20 + 3*4] ; BpS
-   mov eax, [esp+20 + 4*4] ; Rnd
-   and eax, 1
--  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
--  lea ebx, [Rounder1_MMX+eax*8]     ; *Rounder2
-+  call get_pc.cx
-+  add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+  movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff]  ; rounder
-+  lea ebx, [ecx + Rounder1_MMX+eax*8 wrt ..gotoff]     ; *Rounder2
- %endmacro
- 
- %macro EPILOG_AVRG 0
-@@ -261,23 +270,23 @@ SECTION .text
- %macro TLOAD 2     ; %1,%2: src pixels
-   movzx eax, byte [esi+%1]
-   movzx edx, byte [esi+%2]
--  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
--  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
-+  movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
-+  movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
-   paddw mm0, mm7
-   paddw mm3, mm7
- %endmacro
- 
- %macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
-   movzx eax, byte [esi+%1]
--  paddw %4, [%2 + eax*8]
--  paddw %5, [%3 + eax*8]
-+  paddw %4, [eax*8 + %2]
-+  paddw %5, [eax*8 + %3]
- %endmacro
- 
- %macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
-   movzx eax, byte [esi+%1]
--  paddw %5, [%2 + eax*8]
--  paddw %6, [%3 + eax*8]
--  paddw %7, [%4 + eax*8]
-+  paddw %5, [eax*8 + %2]
-+  paddw %6, [eax*8 + %3]
-+  paddw %7, [eax*8 + %4]
- %endmacro
- 
- ;//////////////////////////////////////////////////////////////////////
-@@ -287,32 +296,32 @@ SECTION .text
- %macro LOAD 2     ; %1,%2: src pixels
-   movzx eax, byte [esi+%1]
-   movzx edx, byte [esi+%2]
--  movq mm0, [xvid_Expand_mmx + eax*8]
--  movq mm3, [xvid_Expand_mmx + edx*8]
--  pmullw mm0, [FIR_R0 ]
--  pmullw mm3, [FIR_R16]
-+  movq mm0, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
-+  movq mm3, [ecx + xvid_Expand_mmx + edx*8 wrt ..gotoff]
-+  pmullw mm0, [ecx + FIR_R0  wrt ..gotoff]
-+  pmullw mm3, [ecx + FIR_R16 wrt ..gotoff]
-   paddw mm0, mm7
-   paddw mm3, mm7
- %endmacro
- 
- %macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
-   movzx eax, byte [esi+%1]
--  movq mm4, [xvid_Expand_mmx + eax*8]
-+  movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
-   movq mm5, mm4
-   pmullw mm4, [%2]
--  pmullw mm5, [%2+8]
-+  pmullw mm5, [8+%2]
-   paddw %3, mm4
-   paddw %4, mm5
- %endmacro
- 
- %macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
-   movzx eax, byte [esi+%1]
--  movq mm4, [xvid_Expand_mmx + eax*8]
-+  movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
-   movq mm5, mm4
-   movq mm6, mm5
--  pmullw mm4, [%2   ]
--  pmullw mm5, [%2+ 8]
--  pmullw mm6, [%2+16]
-+  pmullw mm4, [   %2]
-+  pmullw mm5, [ 8+%2]
-+  pmullw mm6, [16+%2]
-   paddw %3, mm4
-   paddw %4, mm5
-   paddw %5, mm6
-@@ -359,23 +368,23 @@ SECTION .text
-   movq mm1, mm7
-   movq mm2, mm7
- 
--  ACCUM2 1,    FIR_R1, mm0, mm1
--  ACCUM2 2,    FIR_R2, mm0, mm1
--  ACCUM2 3,    FIR_R3, mm0, mm1
--  ACCUM2 4,    FIR_R4, mm0, mm1
--
--  ACCUM3 5,    FIR_R5, mm0, mm1, mm2
--  ACCUM3 6,    FIR_R6, mm0, mm1, mm2
--  ACCUM3 7,    FIR_R7, mm0, mm1, mm2
--  ACCUM2 8,    FIR_R8, mm1, mm2
--  ACCUM3 9,    FIR_R9, mm1, mm2, mm3
--  ACCUM3 10,   FIR_R10,mm1, mm2, mm3
--  ACCUM3 11,   FIR_R11,mm1, mm2, mm3
--
--  ACCUM2 12,   FIR_R12, mm2, mm3
--  ACCUM2 13,   FIR_R13, mm2, mm3
--  ACCUM2 14,   FIR_R14, mm2, mm3
--  ACCUM2 15,   FIR_R15, mm2, mm3
-+  ACCUM2 1,    ecx + FIR_R1 wrt ..gotoff, mm0, mm1
-+  ACCUM2 2,    ecx + FIR_R2 wrt ..gotoff, mm0, mm1
-+  ACCUM2 3,    ecx + FIR_R3 wrt ..gotoff, mm0, mm1
-+  ACCUM2 4,    ecx + FIR_R4 wrt ..gotoff, mm0, mm1
-+
-+  ACCUM3 5,    ecx + FIR_R5 wrt ..gotoff, mm0, mm1, mm2
-+  ACCUM3 6,    ecx + FIR_R6 wrt ..gotoff, mm0, mm1, mm2
-+  ACCUM3 7,    ecx + FIR_R7 wrt ..gotoff, mm0, mm1, mm2
-+  ACCUM2 8,    ecx + FIR_R8 wrt ..gotoff, mm1, mm2
-+  ACCUM3 9,    ecx + FIR_R9 wrt ..gotoff, mm1, mm2, mm3
-+  ACCUM3 10,   ecx + FIR_R10 wrt ..gotoff,mm1, mm2, mm3
-+  ACCUM3 11,   ecx + FIR_R11 wrt ..gotoff,mm1, mm2, mm3
-+
-+  ACCUM2 12,   ecx + FIR_R12 wrt ..gotoff, mm2, mm3
-+  ACCUM2 13,   ecx + FIR_R13 wrt ..gotoff, mm2, mm3
-+  ACCUM2 14,   ecx + FIR_R14 wrt ..gotoff, mm2, mm3
-+  ACCUM2 15,   ecx + FIR_R15 wrt ..gotoff, mm2, mm3
- 
- %else
- 
-@@ -383,25 +392,25 @@ SECTION .text
-   movq mm1, mm7
-   movq mm2, mm7
- 
--  TACCUM2 1,    xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
--  TACCUM2 2,    xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
--  TACCUM2 3,    xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
--  TACCUM2 4,    xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1
--
--  TACCUM3 5,    xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0  , mm0, mm1, mm2
--  TACCUM3 6,    xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm1, mm2
--  TACCUM3 7,    xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm1, mm2
--
--  TACCUM2 8,                       xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 ,      mm1, mm2
--
--  TACCUM3 9,                       xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0,  mm1, mm2, mm3
--  TACCUM3 10,                      xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0,  mm1, mm2, mm3
--  TACCUM3 11,                      xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0,  mm1, mm2, mm3
--
--  TACCUM2 12,  xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
--  TACCUM2 13,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
--  TACCUM2 14,  xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
--  TACCUM2 15,  xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3
-+  TACCUM2 1,   ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1
-+  TACCUM2 2,   ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1
-+  TACCUM2 3,   ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1
-+  TACCUM2 4,   ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff, mm0, mm1
-+
-+  TACCUM3 5,   ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff  , mm0, mm1, mm2
-+  TACCUM3 6,   ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff  , mm0, mm1, mm2
-+  TACCUM3 7,   ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff  , mm0, mm1, mm2
-+
-+  TACCUM2 8,   ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff ,      mm1, mm2
-+
-+  TACCUM3 9,   ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff,  mm1, mm2, mm3
-+  TACCUM3 10,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff,  mm1, mm2, mm3
-+  TACCUM3 11,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff,  mm1, mm2, mm3
-+
-+  TACCUM2 12,  ecx + xvid_FIR_1_3_6_20 wrt ..gotoff, ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm2, mm3
-+  TACCUM2 13,  ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm2, mm3
-+  TACCUM2 14,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm2, mm3
-+  TACCUM2 15,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm2, mm3
- 
- %endif
- 
-@@ -418,7 +427,7 @@ SECTION .text
-   MIX mm0, esi+1, ebx
- %endif
- %if (%2==1)
--  MIX mm0, edi, Rounder1_MMX
-+  MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
- 
- %if (%1==1)
-@@ -427,7 +436,7 @@ SECTION .text
-   MIX mm2, esi+9, ebx
- %endif
- %if (%2==1)
--  MIX mm2, edi+8, Rounder1_MMX
-+  MIX mm2, edi+8, ecx + Rounder1_MMX wrt ..gotoff
- %endif
- 
-   lea esi, [esi+ebp]
-@@ -436,7 +445,7 @@ SECTION .text
-   movq [edi+8], mm2
- 
-   add edi, ebp
--  dec ecx
-+  dec dword [esp+20 + 2*4]
-   jg .Loop
- 
- %if (%2==0) && (%1==0)
-@@ -464,64 +473,64 @@ SECTION .text
- %ifndef USE_TABLES
- 
-   LOAD 0, 8  ; special case for 1rst/last pixel
--  ACCUM2 1,  FIR_R1,  mm0, mm3
--  ACCUM2 2,  FIR_R2,  mm0, mm3
--  ACCUM2 3,  FIR_R3,  mm0, mm3
--  ACCUM2 4,  FIR_R4,  mm0, mm3
--
--  ACCUM2 5,  FIR_R13,  mm0, mm3
--  ACCUM2 6,  FIR_R14,  mm0, mm3
--  ACCUM2 7,  FIR_R15,  mm0, mm3
-+  ACCUM2 1,  ecx + FIR_R1 wrt ..gotoff,  mm0, mm3
-+  ACCUM2 2,  ecx + FIR_R2 wrt ..gotoff,  mm0, mm3
-+  ACCUM2 3,  ecx + FIR_R3 wrt ..gotoff,  mm0, mm3
-+  ACCUM2 4,  ecx + FIR_R4 wrt ..gotoff,  mm0, mm3
-+
-+  ACCUM2 5,  ecx + FIR_R13 wrt ..gotoff,  mm0, mm3
-+  ACCUM2 6,  ecx + FIR_R14 wrt ..gotoff,  mm0, mm3
-+  ACCUM2 7,  ecx + FIR_R15 wrt ..gotoff,  mm0, mm3
- 
- %else
- 
- %if 0   ; test with no unrolling
- 
-   TLOAD 0, 8  ; special case for 1rst/last pixel
--  TACCUM2 1,  xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0  , mm0, mm3
--  TACCUM2 2,  xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm3
--  TACCUM2 3,  xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm3
--  TACCUM2 4,  xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
--  TACCUM2 5,  xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, mm0, mm3
--  TACCUM2 6,  xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_7, mm0, mm3
--  TACCUM2 7,  xvid_FIR_0_0_0_1  , xvid_FIR_3_6_19_23, mm0, mm3
-+  TACCUM2 1,  ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff  , mm0, mm3
-+  TACCUM2 2,  ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff  , mm0, mm3
-+  TACCUM2 3,  ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff  , mm0, mm3
-+  TACCUM2 4,  ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm0, mm3
-+  TACCUM2 5,  ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm0, mm3
-+  TACCUM2 6,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm0, mm3
-+  TACCUM2 7,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm0, mm3
- 
- %else  ; test with unrolling (little faster, but not much)
- 
-   movzx eax, byte [esi]
-   movzx edx, byte [esi+8]
--  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
-+  movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
-   movzx eax, byte [esi+1]
--  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
-+  movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
-   paddw mm0, mm7
-   paddw mm3, mm7
- 
-   movzx edx, byte [esi+2]
--  paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
--  paddw mm3, [xvid_FIR_1_0_0_0 + eax*8]
-+  paddw mm0, [ecx + xvid_FIR_23_19_6_3 + eax*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_1_0_0_0 + eax*8 wrt ..gotoff]
- 
-   movzx eax, byte [esi+3]
--  paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
--  paddw mm3, [xvid_FIR_3_1_0_0 + edx*8]
-+  paddw mm0, [ecx + xvid_FIR_7_20_20_6 + edx*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_3_1_0_0 + edx*8 wrt ..gotoff]
- 
-   movzx edx, byte [esi+4]
--  paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
--  paddw mm3, [xvid_FIR_6_3_1_0 + eax*8]
-+  paddw mm0, [ecx + xvid_FIR_3_6_20_20 + eax*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_6_3_1_0 + eax*8 wrt ..gotoff]
- 
-   movzx eax, byte [esi+5]
--  paddw mm0, [xvid_FIR_1_3_6_20 + edx*8]
--  paddw mm3, [xvid_FIR_20_6_3_1 + edx*8]
-+  paddw mm0, [ecx + xvid_FIR_1_3_6_20 + edx*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_20_6_3_1 + edx*8 wrt ..gotoff]
- 
-   movzx edx, byte [esi+6]
--  paddw mm0, [xvid_FIR_0_1_3_6 + eax*8]
--  paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]
-+  paddw mm0, [ecx + xvid_FIR_0_1_3_6 + eax*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_20_20_6_3 + eax*8 wrt ..gotoff]
- 
-   movzx eax, byte [esi+7]
--  paddw mm0, [xvid_FIR_0_0_1_3 + edx*8]
--  paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]
-+  paddw mm0, [ecx + xvid_FIR_0_0_1_3 + edx*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_6_20_20_7 + edx*8 wrt ..gotoff]
- 
--  paddw mm0, [xvid_FIR_0_0_0_1 + eax*8]
--  paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]
-+  paddw mm0, [ecx + xvid_FIR_0_0_0_1 + eax*8 wrt ..gotoff]
-+  paddw mm3, [ecx + xvid_FIR_3_6_19_23 + eax*8 wrt ..gotoff]
- 
- %endif
- 
-@@ -537,14 +546,14 @@ SECTION .text
-   MIX mm0, esi+1, ebx
- %endif
- %if (%2==1)
--  MIX mm0, edi, Rounder1_MMX
-+  MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
- 
-   movq [edi], mm0
- 
-   add edi, ebp
-   add esi, ebp
--  dec ecx
-+  dec dword [esp+20 + 2*4]
-   jg .Loop
- 
- %if (%2==0) && (%1==0)
-@@ -678,7 +687,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   V_MIX %3, esi, ebx
- %endif
- %if (%2==1)
--  V_MIX %3, edi, Rounder1_MMX
-+  V_MIX %3, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
- 
-   movd eax, %3
-@@ -718,28 +727,28 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_STORE %1, %2, mm0, 0
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-   V_STORE %1, %2, mm1, 0
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_STORE %1, %2, mm2, 0
- 
-   V_LOAD 1
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-   V_STORE %1, %2, mm3, 0
- 
-     ; ouput rows [4..7], from input rows [1..11] (!!)
-@@ -756,38 +765,38 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC1 mm0, FIR_Cm1
-+  V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1
-+  V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
--  V_ACC1 mm2, FIR_Cm1
-+  V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_STORE %1, %2, mm0, 0
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-   V_STORE %1, %2, mm1, 0
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_STORE %1, %2, mm2, 0
- 
-   V_LOAD 1
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-   V_STORE %1, %2, mm3, 0
- 
-     ; ouput rows [8..11], from input rows [5..15]
-@@ -804,39 +813,39 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC1 mm0, FIR_Cm1
-+  V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1
-+  V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
--  V_ACC1 mm2, FIR_Cm1
-+  V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- 
-   V_STORE %1, %2, mm0, 0
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-   V_STORE %1, %2, mm1, 0
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_STORE %1, %2, mm2, 0
- 
-   V_LOAD 1
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-   V_STORE %1, %2, mm3, 0
- 
- 
-@@ -855,25 +864,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 1
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_STORE %1, %2, mm3, 0
-   V_STORE %1, %2, mm2, 0
-@@ -886,7 +895,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   pop edi
-   add esi, 4
-   add edi, 4
--  sub ecx, 4
-+  sub dword [esp+20 + 2*4], 4
-   jg .Loop
- 
- %if (%2==0) && (%1==0)
-@@ -924,29 +933,29 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_STORE %1, %2, mm0, 0
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- 
-   V_STORE %1, %2, mm1, 0
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-   V_STORE %1, %2, mm2, 0
- 
-   V_LOAD 1
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-   V_STORE %1, %2, mm3, 0
- 
-     ; ouput rows [4..7], from input rows [1..9]
-@@ -964,25 +973,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   movq mm3, mm7
- 
-   V_LOAD 0
--  V_ACC1 mm3, FIR_Cm1
-+  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
-+  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
--  V_ACC1 mm3, FIR_Cm6
-+  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
-+  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- 
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
-   V_LOAD 0
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-   V_LOAD 1
--  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- 
-   V_STORE %1, %2, mm3, 0
-   V_STORE %1, %2, mm2, 0
-@@ -995,7 +1004,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
-   pop edi
-   add esi, 4
-   add edi, 4
--  sub ecx, 4
-+  sub dword [esp+20 + 2*4], 4
-   jg .Loop
- 
- %if (%2==0) && (%1==0)
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/reduced_mmx.asm xvidcore-1.1.2/src/image/x86_asm/reduced_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/reduced_mmx.asm	2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/reduced_mmx.asm	2007-01-27 13:33:30.000000000 +0100
-@@ -91,8 +91,8 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
-   pmullw mm4, %4 ; [Up31]
-   pmullw %2,  %3 ; [Up13]
-   pmullw mm5, %4 ; [Up31]
--  paddsw %1, [Cst2]
--  paddsw %2, [Cst2]
-+  paddsw %1, [ebp + Cst2 wrt ..gotoff]
-+  paddsw %2, [ebp + Cst2 wrt ..gotoff]
-   paddsw %1, mm4
-   paddsw %2, mm5
- %endmacro
-@@ -126,14 +126,14 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
- 
- %macro MIX_ROWS 4   ; %1/%2:prev %3/4:cur (preserved)  mm4/mm5: output
-   ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
--  movq mm4, [Cst3]
--  movq mm5, [Cst3]
-+  movq mm4, [ebp + Cst3 wrt ..gotoff]
-+  movq mm5, [ebp + Cst3 wrt ..gotoff]
-   pmullw mm4, %3
-   pmullw mm5, %4
-   paddsw mm4, %1
-   paddsw mm5, %2
--  pmullw %1, [Cst3]
--  pmullw %2, [Cst3]
-+  pmullw %1, [ebp + Cst3 wrt ..gotoff]
-+  pmullw %2, [ebp + Cst3 wrt ..gotoff]
-   paddsw %1, %3
-   paddsw %2, %4
- %endmacro
<Skipped 1787 lines>
================================================================

---- gitweb:

http://git.pld-linux.org/gitweb.cgi/packages/xvid.git/commitdiff/f24cf361b3f55380b384d2dbce4e74092da8bfa0




More information about the pld-cvs-commit mailing list