[packages/xvid] remove textrel patch dropped from spec in 9731c6c
atler
atler at pld-linux.org
Wed Nov 26 00:09:51 CET 2025
commit 78e2c6266c54c7c18ad4c62bbf9a0831dbf7e49f
Author: Jan Palus <atler at pld-linux.org>
Date:   Wed Nov 26 00:01:17 2025 +0100

    remove textrel patch dropped from spec in 9731c6c
xvid-1.1.2-textrel.patch | 5757 ----------------------------------------------
1 file changed, 5757 deletions(-)
---
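For context: the patch deleted below is what made xvidcore's hand-written x86 assembly position-independent, so the shared library could be built without text relocations (TEXTREL). Two techniques are visible in the hunks that follow: small constants (cbp_mmx.asm, cbp_sse2.asm) are built on the stack with push and loaded from [esp] instead of being fetched from .rodata by absolute address, and the larger coefficient tables are addressed GOT-relative through ebx after a get-PC thunk. A minimal NASM sketch of the GOT pattern — my_func and my_table are hypothetical names, while the get_pc.bx thunk and the wrt ..gotpc / wrt ..gotoff expressions are copied verbatim from the patch:

BITS 32

extern _GLOBAL_OFFSET_TABLE_

SECTION .rodata
ALIGN 8
my_table:				; hypothetical constant table
	dw 0, -1, -1, -1

SECTION .text

; get-PC thunk: returns the caller's address (the instruction
; following the call) in ebx
get_pc.bx:
	mov ebx, [esp]
	retn

my_func:
	push ebx			; ebx is callee-saved
	call get_pc.bx			; ebx <- address of the next instruction
	; turn that address into the GOT base; the linker resolves the
	; relocation at link time, so nothing is patched in .text at load
	add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
	; data is now reached via a GOT-relative displacement rather than
	; an absolute address, leaving .text free of relocations
	movq mm7, [ebx + my_table wrt ..gotoff]
	pop ebx
	ret

Note the knock-on cost that shows up throughout the diff: the extra push ebx shifts the stack, so every argument load changes from [esp + 4] to [esp + 4 + 4], and routines that previously used ebx as a scratch register (fdct_mmx_skal.asm) are rewritten to use esi instead.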
diff --git a/xvid-1.1.2-textrel.patch b/xvid-1.1.2-textrel.patch
deleted file mode 100644
index 2d3d773..0000000
--- a/xvid-1.1.2-textrel.patch
+++ /dev/null
@@ -1,5757 +0,0 @@
-diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm
---- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -50,23 +50,6 @@ BITS 32
- %endmacro
-
- ;=============================================================================
--; Local data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--
--mult_mask:
-- db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00
--ignore_dc:
-- dw 0, -1, -1, -1
--
--;=============================================================================
- ; Code
- ;=============================================================================
-
-@@ -91,7 +74,12 @@ ALIGN 16
- calc_cbp_mmx:
- mov eax, [esp + 4] ; coeff
-
-- movq mm7, [ignore_dc]
-+ push byte 0 ; align esp to 8 bytes
-+ push byte -1
-+ push dword 0xFFFF0000
-+ movq mm7, [esp]
-+ add esp, byte 8
-+
- pxor mm6, mm6 ; used only for comparing
- movq mm0, [eax+128*0]
- movq mm1, [eax+128*1]
-@@ -123,7 +111,11 @@ calc_cbp_mmx:
- MAKE_LOAD 13
- MAKE_LOAD 14
-
-- movq mm7, [mult_mask]
-+ push dword 0x00000201
-+ push dword 0x08042010
-+ movq mm7, [esp]
-+ add esp, byte 12
-+
- packssdw mm0, mm1
- packssdw mm2, mm3
- packssdw mm4, mm5
-diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm
---- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -69,20 +69,6 @@ BITS 32
- %endmacro
-
- ;=============================================================================
--; Data (Read Only)
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--ignore_dc:
-- dw 0, -1, -1, -1, -1, -1, -1, -1
--
--;=============================================================================
- ; Code
- ;=============================================================================
-
-@@ -98,7 +84,13 @@ calc_cbp_sse2:
- mov edx, [esp+4] ; coeff[]
- xor eax, eax ; cbp = 0
-
-- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
-+ sub esp,byte 12 ; align esp to 16 bytes
-+ push byte -1
-+ push byte -1
-+ push byte -1
-+ push dword 0xFFFF0000
-+ movdqu xmm7, [esp] ; mask to ignore dc value
-+ add esp, byte 28
- pxor xmm6, xmm6 ; zero
-
- LOOP_SSE2 0
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -204,7 +204,7 @@ fdct_r_row:
- psllw mm4, SHIFT_FRW_COL
- movq mm6, mm0
- psubsw mm2, mm1
-- movq mm1, [fdct_tg_all_16 + 4*2]
-+ movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
- psubsw mm0, mm4
- movq mm7, [%2 + %3*2 + 3*16]
- pmulhw mm1, mm0
-@@ -216,9 +216,9 @@ fdct_r_row:
- psubsw mm5, mm7
- paddsw mm1, mm5
- paddsw mm4, mm7
-- por mm1, [fdct_one_corr]
-+ por mm1, [ebx + fdct_one_corr wrt ..gotoff]
- psllw mm2, SHIFT_FRW_COL + 1
-- pmulhw mm5, [fdct_tg_all_16 + 4*2]
-+ pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
- movq mm7, mm4
- psubsw mm3, [%2 + %3*2 + 5*16]
- psubsw mm4, mm6
-@@ -230,34 +230,34 @@ fdct_r_row:
- movq mm6, mm2
- movq [%1 + %3*2 + 4*16], mm4
- paddsw mm2, mm3
-- pmulhw mm2, [ocos_4_16]
-+ pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff]
- psubsw mm6, mm3
-- pmulhw mm6, [ocos_4_16]
-+ pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff]
- psubsw mm5, mm0
-- por mm5, [fdct_one_corr]
-+ por mm5, [ebx + fdct_one_corr wrt ..gotoff]
- psllw mm1, SHIFT_FRW_COL
-- por mm2, [fdct_one_corr]
-+ por mm2, [ebx + fdct_one_corr wrt ..gotoff]
- movq mm4, mm1
- movq mm3, [%2 + %3*2 + 0*16]
- paddsw mm1, mm6
- psubsw mm3, [%2 + %3*2 + 7*16]
- psubsw mm4, mm6
-- movq mm0, [fdct_tg_all_16 + 0*2]
-+ movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
- psllw mm3, SHIFT_FRW_COL
-- movq mm6, [fdct_tg_all_16 + 8*2]
-+ movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
- pmulhw mm0, mm1
- movq [%1 + %3*2 + 0*16], mm7
- pmulhw mm6, mm4
- movq [%1 + %3*2 + 6*16], mm5
- movq mm7, mm3
-- movq mm5, [fdct_tg_all_16 + 8*2]
-+ movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
- psubsw mm7, mm2
- paddsw mm3, mm2
- pmulhw mm5, mm7
- paddsw mm0, mm3
- paddsw mm6, mm4
-- pmulhw mm3, [fdct_tg_all_16 + 0*2]
-- por mm0, [fdct_one_corr]
-+ pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
-+ por mm0, [ebx + fdct_one_corr wrt ..gotoff]
- paddsw mm5, mm7
- psubsw mm7, mm6
- movq [%1 + %3*2 + 1*16], mm0
-@@ -287,28 +287,28 @@ fdct_r_row:
- movq mm6, mm5
- punpckldq mm3, mm5
- punpckhdq mm6, mm3
-- movq mm3, [%3 + 0*2]
-- movq mm4, [%3 + 4*2]
-+ movq mm3, [0*2 + %3]
-+ movq mm4, [4*2 + %3]
- punpckldq mm2, mm0
- pmaddwd mm3, mm0
- punpckhdq mm1, mm2
-- movq mm2, [%3 + 16*2]
-+ movq mm2, [16*2 + %3]
- pmaddwd mm4, mm1
-- pmaddwd mm0, [%3 + 8*2]
-- movq mm7, [%3 + 20*2]
-+ pmaddwd mm0, [8*2 + %3]
-+ movq mm7, [20*2 + %3]
- pmaddwd mm2, mm5
-- paddd mm3, [fdct_r_row]
-+ paddd mm3, [ebx + fdct_r_row wrt ..gotoff]
- pmaddwd mm7, mm6
-- pmaddwd mm1, [%3 + 12*2]
-+ pmaddwd mm1, [12*2 + %3]
- paddd mm3, mm4
-- pmaddwd mm5, [%3 + 24*2]
-- pmaddwd mm6, [%3 + 28*2]
-+ pmaddwd mm5, [24*2 + %3]
-+ pmaddwd mm6, [28*2 + %3]
- paddd mm2, mm7
-- paddd mm0, [fdct_r_row]
-+ paddd mm0, [ebx + fdct_r_row wrt ..gotoff]
- psrad mm3, SHIFT_FRW_ROW
-- paddd mm2, [fdct_r_row]
-+ paddd mm2, [ebx + fdct_r_row wrt ..gotoff]
- paddd mm0, mm1
-- paddd mm5, [fdct_r_row]
-+ paddd mm5, [ebx + fdct_r_row wrt ..gotoff]
- psrad mm2, SHIFT_FRW_ROW
- paddd mm5, mm6
- psrad mm0, SHIFT_FRW_ROW
-@@ -336,23 +336,23 @@ fdct_r_row:
- psubsw mm1, mm5
- pshufw mm2, mm0, 0x4E
- pshufw mm3, mm1, 0x4E
-- movq mm4, [%3 + 0*2]
-- movq mm6, [%3 + 4*2]
-- movq mm5, [%3 + 16*2]
-- movq mm7, [%3 + 20*2]
-+ movq mm4, [ 0*2 + %3]
-+ movq mm6, [ 4*2 + %3]
-+ movq mm5, [16*2 + %3]
-+ movq mm7, [20*2 + %3]
- pmaddwd mm4, mm0
- pmaddwd mm5, mm1
- pmaddwd mm6, mm2
- pmaddwd mm7, mm3
-- pmaddwd mm0, [%3 + 8*2]
-- pmaddwd mm2, [%3 + 12*2]
-- pmaddwd mm1, [%3 + 24*2]
-- pmaddwd mm3, [%3 + 28*2]
-+ pmaddwd mm0, [ 8*2 + %3]
-+ pmaddwd mm2, [12*2 + %3]
-+ pmaddwd mm1, [24*2 + %3]
-+ pmaddwd mm3, [28*2 + %3]
- paddd mm4, mm6
- paddd mm5, mm7
- paddd mm0, mm2
- paddd mm1, mm3
-- movq mm7, [fdct_r_row]
-+ movq mm7, [ebx + fdct_r_row wrt ..gotoff]
- paddd mm4, mm7
- paddd mm5, mm7
- paddd mm0, mm7
-@@ -377,6 +377,10 @@ cglobal %1
- ;; Move the destination/source address to the eax register
- mov eax, [esp + 4]
-
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- ;; Process the columns (4 at a time)
- FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
- FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
-@@ -386,12 +390,12 @@ cglobal %1
- %assign i 0
- %rep 8
- ;; Process the 'i'th row
-- %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
-+ %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff
- %assign i i+1
- %endrep
- %else
- mov ecx, 8
-- mov edx, tab_frw_01234567
-+ mov edx, [ebx + tab_frw_01234567 wrt ..gotoff]
- ALIGN 8
- .loop
- %2 eax, eax, edx
-@@ -401,6 +405,7 @@ ALIGN 8
- jne .loop
- %endif
-
-+ pop ebx
- ret
- .endfunc
- %endmacro
-@@ -411,6 +416,11 @@ ALIGN 8
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- ;-----------------------------------------------------------------------------
- ; void fdct_mmx_ffmpeg(int16_t block[64]);
- ;-----------------------------------------------------------------------------
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -294,15 +294,15 @@ MMX_One:
- paddsw mm2, mm1 ; mm2: t6+t5
- movq [%1+0*16], mm5 ; => out0
-
-- movq mm4, [tan2] ; mm4 <= tan2
-+ movq mm4, [ebx + tan2 wrt ..gotoff] ; mm4 <= tan2
- pmulhw mm4, mm7 ; tm03*tan2
-- movq mm5, [tan2] ; mm5 <= tan2
-+ movq mm5, [ebx + tan2 wrt ..gotoff] ; mm5 <= tan2
- psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12
- pmulhw mm5, mm6 ; tm12*tan2
- paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03
-
-- movq mm6, [sqrt2]
-- movq mm7, [MMX_One]
-+ movq mm6, [ebx + sqrt2 wrt ..gotoff]
-+ movq mm7, [ebx + MMX_One wrt ..gotoff]
-
- pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4
- por mm5, mm7 ; correct out2
-@@ -320,8 +320,8 @@ MMX_One:
- paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65
- paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65
-
-- movq mm4, [tan3] ; tan3 - 1
-- movq mm5, [tan1] ; tan1
-+ movq mm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1
-+ movq mm5, [ebx + tan1 wrt ..gotoff] ; tan1
-
- movq mm7, mm3 ; save tm465
- pmulhw mm3, mm4 ; tm465*(tan3-1)
-@@ -364,23 +364,23 @@ MMX_One:
- punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
- punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
-
-- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
-- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
-+ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
-+ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
- pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
-- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
-+ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
- pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
-- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
-+ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
- pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
-- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
-+ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
- pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
-- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
-+ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
- pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
- paddd mm2, mm3 ; [ out0 | out1 ]
- pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
- psrad mm2, 16
-- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
-+ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
- paddd mm4, mm5 ; [ out2 | out3 ]
-- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
-+ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
- psrad mm4, 16
-
- paddd mm6, mm7 ; [ out4 | out5 ]
-@@ -422,23 +422,23 @@ MMX_One:
- punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
- punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
-
-- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
-- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
-+ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
-+ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
- pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
-- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
-+ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
- pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
-- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
-+ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
- pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
-- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
-+ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
- pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
-- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
-+ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
- pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
- paddd mm2, mm3 ; [ out0 | out1 ]
- pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
- psrad mm2, 16
-- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
-+ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
- paddd mm4, mm5 ; [ out2 | out3 ]
-- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
-+ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
- psrad mm4, 16
-
- paddd mm6, mm7 ; [ out4 | out5 ]
-@@ -467,12 +467,16 @@ MMX_One:
- ALIGN 16
- cglobal %1
- %1:
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %ifdef UNROLLED_LOOP
-- mov ecx, [esp + 4]
-+ mov ecx, [esp + 4 + 4]
- %else
-- push ebx
-+ push esi
- push edi
-- mov ecx, [esp + 8 + 4]
-+ mov ecx, [esp + 12 + 4]
- %endif
-
- fLLM_PASS ecx+0, ecx+0, 3
-@@ -481,27 +485,28 @@ cglobal %1
- %ifdef UNROLLED_LOOP
- %assign i 0
- %rep 8
-- %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8
-+ %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff
- %assign i i+1
- %endrep
- %else
- mov eax, 8
-- mov edx, fdct_table
-- mov ebx, fdct_rounding_1
-- mov edi, fdct_rounding_2
-+ lea edx, [ebx + fdct_table wrt ..gotoff]
-+ lea esi, [ebx + fdct_rounding_1 wrt ..gotoff]
-+ lea edi, [ebx + fdct_rounding_2 wrt ..gotoff]
- .loop
-- %2 ecx, ecx, edx, ebx, edi
-+ %2 ecx, ecx, edx, esi, edi
- add ecx, 2*8
- add edx, 2*32
-- add ebx, 2*4
-+ add esi, 2*4
- add edi, 2*4
- dec eax
- jne .loop
-
- pop edi
-- pop ebx
-+ pop esi
- %endif
-
-+ pop ebx
- ret
- .endfunc
- %endmacro
-@@ -512,6 +517,11 @@ cglobal %1
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- ;-----------------------------------------------------------------------------
- ; void fdct_mmx_skal(int16_t block[64]];
- ;-----------------------------------------------------------------------------
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -238,10 +238,10 @@ cglobal fdct_sse2_skal
- pshufd xmm6, xmm0, 01010101b ; [13131313]
- pshufd xmm7, xmm0, 11111111b ; [57575757]
-
-- pmaddwd xmm4, [%2+ 0] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
-- pmaddwd xmm5, [%2+16] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
-- pmaddwd xmm6, [%2+32] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
-- pmaddwd xmm7, [%2+48] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
-+ pmaddwd xmm4, [ 0 + %2] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
-+ pmaddwd xmm5, [16 + %2] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
-+ pmaddwd xmm6, [32 + %2] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
-+ pmaddwd xmm7, [48 + %2] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
- paddd xmm4, [%3] ; Round
-
- paddd xmm6, xmm7 ; [b0|b1|b2|b3]
-@@ -267,12 +267,12 @@ cglobal fdct_sse2_skal
-
- %macro iLLM_PASS 1 ; %1: src/dst
-
-- movdqa xmm0, [tan3] ; t3-1
-+ movdqa xmm0, [ebx + tan3 wrt ..gotoff] ; t3-1
- movdqa xmm3, [%1+16*3] ; x3
- movdqa xmm1, xmm0 ; t3-1
- movdqa xmm5, [%1+16*5] ; x5
-
-- movdqa xmm4, [tan1] ; t1
-+ movdqa xmm4, [ebx + tan1 wrt ..gotoff] ; t1
- movdqa xmm6, [%1+16*1] ; x1
- movdqa xmm7, [%1+16*7] ; x7
- movdqa xmm2, xmm4 ; t1
-@@ -290,7 +290,7 @@ cglobal fdct_sse2_skal
- psubsw xmm2, xmm7 ; x1*t1-x7 = tm17
-
-
-- movdqa xmm3, [sqrt2]
-+ movdqa xmm3, [ebx + sqrt2 wrt ..gotoff]
- movdqa xmm7, xmm4
- movdqa xmm6, xmm2
- psubsw xmm4, xmm1 ; tp17-tp35 = t1
-@@ -310,7 +310,7 @@ cglobal fdct_sse2_skal
- paddsw xmm0, xmm0 ; 2.(t1+t2) = b1
- paddsw xmm4, xmm4 ; 2.(t1-t2) = b2
-
-- movdqa xmm7, [tan2] ; t2
-+ movdqa xmm7, [ebx + tan2 wrt ..gotoff] ; t2
- movdqa xmm3, [%1+2*16] ; x2
- movdqa xmm6, [%1+6*16] ; x6
- movdqa xmm5, xmm7 ; t2
-@@ -402,55 +402,58 @@ cglobal fdct_sse2_skal
-
- ALIGN 16
- idct_sse2_skal:
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-
-- mov ecx, [esp+ 4] ; Src
-+ mov ecx, [esp+ 4 +4] ; Src
-
- TEST_ROW ecx, .Row0_Round
-- iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11
-+ iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11
- jmp .Row1
- .Row0_Round
-- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
-+ movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff]
- movdqa [ecx ], xmm0
-
- .Row1
- TEST_ROW ecx+16, .Row1_Round
-- iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11
-+ iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11
- jmp .Row2
- .Row1_Round
-- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
-+ movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff]
- movdqa [ecx+16 ], xmm0
-
- .Row2
- TEST_ROW ecx+32, .Row2_Round
-- iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11
-+ iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11
- jmp .Row3
- .Row2_Round
-- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
-+ movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff]
- movdqa [ecx+32 ], xmm0
-
- .Row3
- TEST_ROW ecx+48, .Row4
-- iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11
-+ iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11
-
- .Row4
- TEST_ROW ecx+64, .Row5
-- iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11
-+ iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11
-
- .Row5
- TEST_ROW ecx+80, .Row6
-- iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11
-+ iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11
-
- .Row6
- TEST_ROW ecx+96, .Row7
-- iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11
-+ iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11
-
- .Row7
- TEST_ROW ecx+112, .End
-- iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11
-+ iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt ..gotoff, 11
- .End
-
- iLLM_PASS ecx
--
-+ pop ebx
- ret
- .endfunc
-
-@@ -507,15 +510,15 @@ idct_sse2_skal:
- paddsw xmm2, xmm1 ; xmm2: t6+t5
- movdqa [%1+0*16], xmm5 ; => out0
-
-- movdqa xmm4, [tan2] ; xmm4 <= tan2
-+ movdqa xmm4, [ebx + tan2 wrt ..gotoff] ; xmm4 <= tan2
- pmulhw xmm4, xmm7 ; tm03*tan2
-- movdqa xmm5, [tan2] ; xmm5 <= tan2
-+ movdqa xmm5, [ebx + tan2 wrt ..gotoff] ; xmm5 <= tan2
- psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12
- pmulhw xmm5, xmm6 ; tm12*tan2
- paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03
-
-- movdqa xmm6, [sqrt2]
-- movdqa xmm7, [Rounder1]
-+ movdqa xmm6, [ebx + sqrt2 wrt ..gotoff]
-+ movdqa xmm7, [ebx + Rounder1 wrt ..gotoff]
-
- pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4
- por xmm5, xmm7 ; correct out2
-@@ -533,8 +536,8 @@ idct_sse2_skal:
- paddsw xmm2, xmm4 ; xmm2: tp765 = t7 + tp65
- paddsw xmm1, xmm5 ; xmm1: tp465 = t4 + tm65
-
-- movdqa xmm4, [tan3] ; tan3 - 1
-- movdqa xmm5, [tan1] ; tan1
-+ movdqa xmm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1
-+ movdqa xmm5, [ebx + tan1 wrt ..gotoff] ; tan1
-
- movdqa xmm7, xmm3 ; save tm465
- pmulhw xmm3, xmm4 ; tm465*(tan3-1)
-@@ -581,12 +584,12 @@ idct_sse2_skal:
- ; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7']
- ; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ]
-
-- movdqa xmm1, [%2+16]
-- movdqa xmm3, [%2+32]
-+ movdqa xmm1, [16+%2]
-+ movdqa xmm3, [32+%2]
- pmaddwd xmm1, xmm2
- pmaddwd xmm3, xmm0
-- pmaddwd xmm2, [%2+48]
-- pmaddwd xmm0, [%2+ 0]
-+ pmaddwd xmm2, [48+%2]
-+ pmaddwd xmm0, [ 0+%2]
-
- paddd xmm0, xmm1 ; [ out0 | out1 ][ out2 | out3 ]
- paddd xmm2, xmm3 ; [ out4 | out5 ][ out6 | out7 ]
-@@ -601,22 +604,33 @@ idct_sse2_skal:
- movdqa [ecx+%1*16+0], xmm0
- %endmacro
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- ;-----------------------------------------------------------------------------
- ; Function Forward DCT
- ;-----------------------------------------------------------------------------
-
- ALIGN 16
- fdct_sse2_skal:
-- mov ecx, [esp+4]
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov ecx, [esp+4+4]
- fLLM_PASS ecx+0, 3
-- fMTX_MULT 0, fTab1, Fdct_Rnd0
-- fMTX_MULT 1, fTab2, Fdct_Rnd2
-- fMTX_MULT 2, fTab3, Fdct_Rnd1
-- fMTX_MULT 3, fTab4, Fdct_Rnd1
-- fMTX_MULT 4, fTab1, Fdct_Rnd0
-- fMTX_MULT 5, fTab4, Fdct_Rnd1
-- fMTX_MULT 6, fTab3, Fdct_Rnd1
-- fMTX_MULT 7, fTab2, Fdct_Rnd1
-+ fMTX_MULT 0, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
-+ fMTX_MULT 1, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd2 wrt ..gotoff
-+ fMTX_MULT 2, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+ fMTX_MULT 3, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+ fMTX_MULT 4, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
-+ fMTX_MULT 5, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+ fMTX_MULT 6, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+ fMTX_MULT 7, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
-+
-+ pop ebx
- ret
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_3dne.asm xvidcore-1.1.2/src/dct/x86_asm/idct_3dne.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_3dne.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_3dne.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -223,6 +223,11 @@ tab_i_35_xmm:
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- cglobal idct_3dne
-
- ;-----------------------------------------------------------------------------
-@@ -231,25 +236,29 @@ cglobal idct_3dne
-
- ALIGN 16
- idct_3dne:
-- mov eax, [esp+4]
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp+4+4]
-
- ; DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0
- pshufw mm0, [eax+64],10001000b ; x2 x0 x2 x0
-- movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- pshufw mm1, [eax+64+8],10001000b ; x6 x4 x6 x4
-- movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02
-+ movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
- pshufw mm2, [eax+64],11011101b ; x3 x1 x3 x1
- pshufw mm5, [eax+64+8],11011101b ; x7 x5 x7 x5
-- movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
-- movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 ;
-- pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 ;
-+ pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+80+8],10001000b ; x6 x4 x6 x4
-@@ -260,12 +269,12 @@ idct_3dne:
- movq mm7, mm0 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0
- paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm0, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -276,19 +285,19 @@ idct_3dne:
- movq [eax+64], mm6 ; 3 ; save y3 y2 y1 y0 stall2
-
- ; DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5
-- movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4
-- paddd mm3, [rounder_5] ; +rounder stall 6
-- paddd mm5, [rounder_5] ; +rounder
-+ paddd mm3, [ebx + rounder_5 wrt ..gotoff] ; +rounder stall 6
-+ paddd mm5, [ebx + rounder_5 wrt ..gotoff] ; +rounder
- movq [eax+64+8], mm7 ; 7 ; save y7 y6 y5 y4
-- movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+96+8],10001000b ; x6 x4 x6 x4
-@@ -299,12 +308,12 @@ idct_3dne:
- movq mm7, mm5 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5
- paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm5, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -315,19 +324,19 @@ idct_3dne:
- movq [eax+80], mm6 ; 3 ; save y3 y2 y1 y0
-
- ; DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6
-- movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6
-- paddd mm3, [rounder_6] ; +rounder
-- paddd mm0, [rounder_6] ; +rounder
-+ paddd mm3, [ebx + rounder_6 wrt ..gotoff] ; +rounder
-+ paddd mm0, [ebx + rounder_6 wrt ..gotoff] ; +rounder
- movq [eax+80+8], mm7 ; 7 ; save y7 y6
-- movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4
-@@ -338,12 +347,12 @@ idct_3dne:
- movq mm7, mm0 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0
- paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm0, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -354,19 +363,19 @@ idct_3dne:
- movq [eax+96], mm6 ; 3 ; save y3 y2 y1 y0 stall2
-
- ; DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7
-- movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4
-- paddd mm3, [rounder_7] ; +rounder stall 6
-- paddd mm5, [rounder_7] ; +rounder
-+ paddd mm3, [ebx + rounder_7 wrt ..gotoff] ; +rounder stall 6
-+ paddd mm5, [ebx + rounder_7 wrt ..gotoff] ; +rounder
- movq [eax+96+8], mm7 ; 7 ; save y7 y6 y5 y4
-- movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4
-@@ -377,12 +386,12 @@ idct_3dne:
- movq mm7, mm5 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5
- paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm5, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1
- pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -393,19 +402,19 @@ idct_3dne:
- movq [eax+112], mm6 ; 3 ; save y3 y2 y1 y0
-
- ; DCT_8_INV_ROW_1_s [eax+0], 0, tab_i_04_xmm, rounder_0
-- movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6
-- paddd mm3, [rounder_0] ; +rounder
-- paddd mm0, [rounder_0] ; +rounder
-+ paddd mm3, [ebx + rounder_0 wrt ..gotoff] ; +rounder
-+ paddd mm0, [ebx + rounder_0 wrt ..gotoff] ; +rounder
- movq [eax+112+8], mm7 ; 7 ; save y7 y6
-- movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm0, mm1 ; 1
- pshufw mm1, [eax+16+8],10001000b ; x6 x4 x6 x4
-@@ -416,12 +425,12 @@ idct_3dne:
- movq mm7, mm0 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0
- paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm0, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -432,19 +441,19 @@ idct_3dne:
- movq [eax+0], mm6 ; 3 ; save y3 y2 y1 y0 stall2
-
- ; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1
-- movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4
-- paddd mm3, [rounder_1] ; +rounder stall 6
-- paddd mm5, [rounder_1] ; +rounder
-+ paddd mm3, [ebx + rounder_1 wrt ..gotoff] ; +rounder stall 6
-+ paddd mm5, [ebx + rounder_1 wrt ..gotoff] ; +rounder
- movq [eax+0+8], mm7 ; 7 ; save y7 y6 y5 y4
-- movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+32+8],10001000b ; x6 x4 x6 x4
-@@ -455,12 +464,12 @@ idct_3dne:
- movq mm7, mm5 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5
- paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm5, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -471,19 +480,19 @@ idct_3dne:
- movq [eax+16], mm6 ; 3 ; save y3 y2 y1 y0
-
- ; DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2
-- movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6
-- paddd mm3, [rounder_2] ; +rounder
-- paddd mm0, [rounder_2] ; +rounder
-+ paddd mm3, [ebx + rounder_2 wrt ..gotoff] ; +rounder
-+ paddd mm0, [ebx + rounder_2 wrt ..gotoff] ; +rounder
- movq [eax+16+8], mm7 ; 7 ; save y7 y6
-- movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2)
- pshufw mm1, [eax+48+8],10001000b ; x6 x4 x6 x4
-@@ -494,12 +503,12 @@ idct_3dne:
- movq mm7, mm0 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0
- paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-- movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00
-+ movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00
- psubd mm7, mm2 ; ; a3-b3 a2-b2
- paddd mm0, mm2 ; 0 free a3+b3 a2+b2
- pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1
- pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00
-- pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
- psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0
- psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2
- psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -510,26 +519,26 @@ idct_3dne:
- movq [eax+32], mm6 ; 3 ; save y3 y2 y1 y0 stall2
-
- ; DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3
-- movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02
-- movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16
-+ movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02
-+ movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16
- pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4
-- paddd mm3, [rounder_3] ; +rounder stall 6
-- paddd mm5, [rounder_3] ; +rounder
-+ paddd mm3, [ebx + rounder_3 wrt ..gotoff] ; +rounder stall 6
-+ paddd mm5, [ebx + rounder_3 wrt ..gotoff] ; +rounder
- movq [eax+32+8], mm7 ; 7 ; save y7 y6 y5 y4
-- movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
-- pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
- pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
-- pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0)
- paddd mm5, mm1 ; mm1 free ; a3=sum(even3) a2=sum(even2)
-- movq mm1, [tg_3_16]
-+ movq mm1, [ebx + tg_3_16 wrt ..gotoff]
- movq mm4, mm3 ; 4 ; a1 a0
- paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0)
- paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2)
-- movq mm0, [tg_3_16]
-+ movq mm0, [ebx + tg_3_16 wrt ..gotoff]
- movq mm7, mm5 ; 7 ; a3 a2
- psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0
- paddd mm3, mm6 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
-@@ -542,7 +551,7 @@ idct_3dne:
- psrad mm2, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2
- movq mm6, [eax+16*1]
- packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5
-- movq mm4, [tg_1_16]
-+ movq mm4, [ebx + tg_1_16 wrt ..gotoff]
- packssdw mm3, mm2 ; 0 free ; y3 y2 y1 y0
- pshufw mm2, mm7, 10110001b ; y7 y6 y5 y4
-
-@@ -559,7 +568,7 @@ idct_3dne:
- paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
- psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
- movq [eax+48], mm3 ; 3 ; save y3 y2 y1 y0
-- movq mm3, [ocos_4_16]
-+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
- paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
- paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
- psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
-@@ -569,7 +578,7 @@ idct_3dne:
- psubsw mm6, mm0 ; tm17-tm35 = b3
- psubsw mm4, mm1 ; tp17-tp35 = t1
- paddsw mm2, mm0 ; tm17+tm35 = t2
-- movq mm7, [tg_2_16]
-+ movq mm7, [ebx + tg_2_16 wrt ..gotoff]
- movq mm1, mm4 ; t1
- movq [eax+3*16], mm5 ; save b0
- paddsw mm1, mm2 ; t1+t2
-@@ -620,7 +629,7 @@ idct_3dne:
- movq mm6, mm2 ; a3
- psraw mm4, SHIFT_INV_COL ; dst7
- movq [eax+5*16], mm0
-- movq mm0, [tg_3_16]
-+ movq mm0, [ebx + tg_3_16 wrt ..gotoff]
- paddsw mm2, mm3 ; a3+b3
- movq [eax+6*16], mm7
- psubsw mm6, mm3 ; a3-b3
-@@ -634,7 +643,7 @@ idct_3dne:
- movq mm5, [eax+8+16*5]
- psraw mm6, SHIFT_INV_COL ; dst4
- pmulhw mm0, mm3 ; x3*(tg_3_16-1)
-- movq mm4, [tg_1_16]
-+ movq mm4, [ebx + tg_1_16 wrt ..gotoff]
- pmulhw mm1, mm5 ; x5*(tg_3_16-1)
- movq mm7, [eax+8+16*7]
- movq [eax+3*16], mm2
-@@ -646,7 +655,7 @@ idct_3dne:
- pmulhw mm2, mm6 ; x1*tg_1_16
- paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
- psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
-- movq mm3, [ocos_4_16]
-+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
- paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
- paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
- psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
-@@ -655,7 +664,7 @@ idct_3dne:
- paddsw mm5, mm1 ; tp17+tp35 = b0
- psubsw mm4, mm1 ; tp17-tp35 = t1
- paddsw mm2, mm0 ; tm17+tm35 = t2
-- movq mm7, [tg_2_16]
-+ movq mm7, [ebx + tg_2_16 wrt ..gotoff]
- movq mm1, mm4 ; t1
- psubsw mm6, mm0 ; tm17-tm35 = b3
- movq [eax+8+3*16], mm5 ; save b0
-@@ -717,6 +726,7 @@ idct_3dne:
- movq [eax+8+3*16], mm2
- movq [eax+8+4*16], mm6
-
-+ pop ebx
- ret
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_mmx.asm xvidcore-1.1.2/src/dct/x86_asm/idct_mmx.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -326,25 +326,25 @@ tab_i_35_xmm:
- punpcklwd mm0, mm1 ; x5 x1 x4 x0
- movq mm5, mm0 ; 5 ; x5 x1 x4 x0
- punpckldq mm0, mm0 ; x4 x0 x4 x0
-- movq mm4, [%3+8] ; 4 ; w07 w05 w03 w01
-+ movq mm4, [8+%3] ; 4 ; w07 w05 w03 w01
- punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2
- pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00
- movq mm6, mm2 ; 6 ; x7 x3 x6 x2
-- movq mm1, [%3+32] ; 1 ; w22 w20 w18 w16
-+ movq mm1, [32+%3] ; 1 ; w22 w20 w18 w16
- punpckldq mm2, mm2 ; x6 x2 x6 x2
- pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01
- punpckhdq mm5, mm5 ; x5 x1 x5 x1
-- pmaddwd mm0, [%3+16] ; x4*w14+x0*w12 x4*w10+x0*w08
-+ pmaddwd mm0, [16+%3] ; x4*w14+x0*w12 x4*w10+x0*w08
- punpckhdq mm6, mm6 ; x7 x3 x7 x3
-- movq mm7, [%3+40] ; 7 ; w23 w21 w19 w17
-+ movq mm7, [40+%3] ; 7 ; w23 w21 w19 w17
- pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
- paddd mm3, [%4] ; +%4
- pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
-- pmaddwd mm2, [%3+24] ; x6*w15+x2*w13 x6*w11+x2*w09
-+ pmaddwd mm2, [24+%3] ; x6*w15+x2*w13 x6*w11+x2*w09
- paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
-- pmaddwd mm5, [%3+48] ; x5*w30+x1*w28 x5*w26+x1*w24
-+ pmaddwd mm5, [48+%3] ; x5*w30+x1*w28 x5*w26+x1*w24
- movq mm4, mm3 ; 4 ; a1 a0
-- pmaddwd mm6, [%3+56] ; x7*w31+x3*w29 x7*w27+x3*w25
-+ pmaddwd mm6, [56+%3] ; x7*w31+x3*w29 x7*w27+x3*w25
- paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
- paddd mm0, [%4] ; +%4
- psubd mm3, mm1 ; a1-b1 a0-b0
-@@ -378,25 +378,25 @@ tab_i_35_xmm:
- movq mm2, mm0 ; 2 ; x3 x2 x1 x0
- movq mm3, [%3] ; 3 ; w05 w04 w01 w00
- pshufw mm0, mm0, 10001000b ; x2 x0 x2 x0
-- movq mm4, [%3+8] ; 4 ; w07 w06 w03 w02
-+ movq mm4, [8+%3] ; 4 ; w07 w06 w03 w02
- movq mm5, mm1 ; 5 ; x7 x6 x5 x4
- pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
-- movq mm6, [%3+32] ; 6 ; w21 w20 w17 w16
-+ movq mm6, [32+%3] ; 6 ; w21 w20 w17 w16
- pshufw mm1, mm1, 10001000b ; x6 x4 x6 x4
- pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
-- movq mm7, [%3+40] ; 7 ; w23 w22 w19 w18
-+ movq mm7, [40+%3] ; 7 ; w23 w22 w19 w18
- pshufw mm2, mm2, 11011101b ; x3 x1 x3 x1
- pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
- pshufw mm5, mm5, 11011101b ; x7 x5 x7 x5
- pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18
- paddd mm3, [%4] ; +%4
-- pmaddwd mm0, [%3+16] ; x2*w13+x0*w12 x2*w09+x0*w08
-+ pmaddwd mm0, [16+%3] ; x2*w13+x0*w12 x2*w09+x0*w08
- paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
-- pmaddwd mm1, [%3+24] ; x6*w15+x4*w14 x6*w11+x4*w10
-+ pmaddwd mm1, [24+%3] ; x6*w15+x4*w14 x6*w11+x4*w10
- movq mm4, mm3 ; 4 ; a1 a0
-- pmaddwd mm2, [%3+48] ; x3*w29+x1*w28 x3*w25+x1*w24
-+ pmaddwd mm2, [48+%3] ; x3*w29+x1*w28 x3*w25+x1*w24
- paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
-- pmaddwd mm5, [%3+56] ; x7*w31+x5*w30 x7*w27+x5*w26
-+ pmaddwd mm5, [56+%3] ; x7*w31+x5*w30 x7*w27+x5*w26
- paddd mm3, mm6 ; a1+b1 a0+b0
- paddd mm0, [%4] ; +%4
- psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0
-@@ -480,12 +480,12 @@ tab_i_35_xmm:
- ;-----------------------------------------------------------------------------
-
- %macro DCT_8_INV_COL 2
-- movq mm0, [tg_3_16]
-+ movq mm0, [ebx + tg_3_16 wrt ..gotoff]
- movq mm3, [%1+16*3]
- movq mm1, mm0 ; tg_3_16
- movq mm5, [%1+16*5]
- pmulhw mm0, mm3 ; x3*(tg_3_16-1)
-- movq mm4, [tg_1_16]
-+ movq mm4, [ebx + tg_1_16 wrt ..gotoff]
- pmulhw mm1, mm5 ; x5*(tg_3_16-1)
- movq mm7, [%1+16*7]
- movq mm2, mm4 ; tg_1_16
-@@ -495,7 +495,7 @@ tab_i_35_xmm:
- pmulhw mm2, mm6 ; x1*tg_1_16
- paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
- psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
-- movq mm3, [ocos_4_16]
-+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
- paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
- paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
- psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
-@@ -505,7 +505,7 @@ tab_i_35_xmm:
- psubsw mm6, mm0 ; tm17-tm35 = b3
- psubsw mm4, mm1 ; tp17-tp35 = t1
- paddsw mm2, mm0 ; tm17+tm35 = t2
-- movq mm7, [tg_2_16]
-+ movq mm7, [ebx + tg_2_16 wrt ..gotoff]
- movq mm1, mm4 ; t1
- ; movq [SCRATCH+0], mm5 ; save b0
- movq [%2+3*16], mm5 ; save b0
-@@ -577,6 +577,11 @@ tab_i_35_xmm:
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- cglobal idct_mmx
- cglobal idct_xmm
-
-@@ -586,22 +591,27 @@ cglobal idct_xmm
-
- ALIGN 16
- idct_mmx:
-- mov eax, dword [esp + 4]
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, dword [esp + 4 + 4]
-
- ;; Process each row
-- DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0
-- DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1
-- DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2
-- DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3
-- DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4
-- DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5
-- DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6
-- DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7
-+ DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
-+ DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
-
- ;; Process the columns (4 at a time)
- DCT_8_INV_COL eax+0, eax+0
- DCT_8_INV_COL eax+8, eax+8
-
-+ pop ebx
- ret
- .endfunc
-
-@@ -611,22 +621,27 @@ idct_mmx:
-
- ALIGN 16
- idct_xmm:
-- mov eax, dword [esp + 4]
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, dword [esp + 4 + 4]
-
- ;; Process each row
-- DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0
-- DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1
-- DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2
-- DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3
-- DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4
-- DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5
-- DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6
-- DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7
-+ DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
-+ DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
-
- ;; Process the columns (4 at a time)
- DCT_8_INV_COL eax+0, eax+0
- DCT_8_INV_COL eax+8, eax+8
-
-+ pop ebx
- ret
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/idct_sse2_dmitry.asm xvidcore-1.1.2/src/dct/x86_asm/idct_sse2_dmitry.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/idct_sse2_dmitry.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/idct_sse2_dmitry.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -183,7 +183,7 @@ cglobal idct_sse2_dmitry
-
- ;a 3210 first part
- pshufd xmm2, xmm1, 10101010b ;x 64646464
-- pmaddwd xmm2, [%3+16] ;w 15 14 11 10 7632
-+ pmaddwd xmm2, [16+%3] ;w 15 14 11 10 7632
-
- ;a 3210 second part
- paddd xmm2, xmm0 ;a 3210 ready
-@@ -191,11 +191,11 @@ cglobal idct_sse2_dmitry
- movdqa xmm5, xmm2
-
- pshufd xmm3, xmm1, 01010101b ;x 31313131
-- pmaddwd xmm3, [%3+32] ;w 29 28 25 24 21 20 17 16
-+ pmaddwd xmm3, [32+%3] ;w 29 28 25 24 21 20 17 16
-
- ;b 3210 first part
- pshufd xmm4, xmm1, 11111111b ;x 75757575
-- pmaddwd xmm4, [%3+48] ;w 31 30 27 26 23 22 19 18
-+ pmaddwd xmm4, [48+%3] ;w 31 30 27 26 23 22 19 18
-
- ;b 3210 second part
- paddd xmm3,xmm4 ;b 3210 ready
-@@ -220,7 +220,7 @@ cglobal idct_sse2_dmitry
-
- movdqa xmm4, [%1+16*2] ;x2
- movdqa xmm5, [%1+16*6] ;x6
-- movdqa xmm6, [tg_2_16]
-+ movdqa xmm6, [ebx + tg_2_16 wrt ..gotoff]
- movdqa xmm7, xmm6
-
- paddsw xmm0, xmm2 ;u04=x0+x4
-@@ -245,12 +245,12 @@ cglobal idct_sse2_dmitry
-
- movdqa xmm0, [%1+16*1] ;x1
- movdqa xmm1, [%1+16*7] ;x7
-- movdqa xmm2, [tg_1_16]
-+ movdqa xmm2, [ebx + tg_1_16 wrt ..gotoff]
- movdqa xmm3, xmm2
-
- movdqa xmm4, [%1+16*3] ;x3
- movdqa xmm5, [%1+16*5] ;x5
-- movdqa xmm6, [tg_3_16]
-+ movdqa xmm6, [ebx + tg_3_16 wrt ..gotoff]
- movdqa xmm7, xmm6
-
- pmulhw xmm2, xmm0
-@@ -267,7 +267,7 @@ cglobal idct_sse2_dmitry
- psubsw xmm6, xmm5 ;v35=x3*T3-x5
- paddsw xmm7, xmm4 ;u35=x5*T3+x3
-
-- movdqa xmm4, [ocos_4_16]
-+ movdqa xmm4, [ebx + ocos_4_16 wrt ..gotoff]
-
- paddsw xmm0, xmm7 ;b0=u17+u35
- psubsw xmm1, xmm6 ;b3=v17-v35
-@@ -322,26 +322,35 @@ cglobal idct_sse2_dmitry
- movdqa [%2+16*5], xmm7
- %endmacro
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- ;-----------------------------------------------------------------------------
- ; void idct_sse2_dmitry(int16_t coeff[64]);
- ;-----------------------------------------------------------------------------
-
- ALIGN 16
- idct_sse2_dmitry:
--
-- mov eax, [esp + 4]
--
-- DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, tab_i_04, rounder_2_0
-- DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1
-- DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2
-- DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3
-- DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4
-- DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5
-- DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6
-- DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 4 + 4]
-+
-+ DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_0 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_1 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_2 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_3 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_4 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_5 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_6 wrt ..gotoff
-+ DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_7 wrt ..gotoff
-
- DCT_8_INV_COL_4_SSE2 eax, eax
-
-+ pop ebx
- ret
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/simple_idct_mmx.asm xvidcore-1.1.2/src/dct/x86_asm/simple_idct_mmx.asm
---- xvidcore-1.1.2-old/src/dct/x86_asm/simple_idct_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/dct/x86_asm/simple_idct_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -122,7 +122,7 @@ coeffs:
- movq mm1,[src4] ; R6 R2 r6 r2
- movq mm2,[src1] ; R3 R1 r3 r1
- movq mm3,[src5] ; R7 R5 r7 r5
-- movq mm4,[wm1010]
-+ movq mm4,[ebx + wm1010 wrt ..gotoff]
- pand mm4,mm0
- por mm4,mm1
- por mm4,mm2
-@@ -131,29 +131,29 @@ coeffs:
- movd eax,mm4
- or eax,eax
- jz near .skip1
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4,mm5 ; A0 a0
- psubd mm6,mm5 ; A3 a3
-- movq mm5,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
- rounder_op mm0, rounder_arg
- paddd mm1,mm0 ; A1 a1
- paddd mm0,mm0
- psubd mm0,mm1 ; A2 a2
-- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
-+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm5 ; B0 b0
-- movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -170,14 +170,14 @@ coeffs:
- packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [dst],mm7
- movq mm1,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+80] ;-C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ;-C1 C5 -C1 C5
- movq [dst + 24],mm2
- pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
-- movq mm7,[coeffs+88] ; C3 C7 C3 C7
-- pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
-+ pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm0 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4,mm7 ; B2 b2
- paddd mm2,mm4 ; A2+B2 a2+b2
- psubd mm0,mm4 ; a2-B2 a2-b2
-@@ -196,7 +196,7 @@ coeffs:
- jmp short .skip2
- .skip1
- pslld mm0,16
-- paddd mm0,[d40000]
-+ paddd mm0,[ebx + d40000 wrt ..gotoff]
- psrad mm0,13
- packssdw mm0,mm0
- movq [ dst ],mm0
-@@ -240,29 +240,29 @@ coeffs:
- movd eax,mm4
- or eax,eax
- jz near bt
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4,mm5 ; A0 a0
- psubd mm6,mm5 ; A3 a3
-- movq mm5,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
- rounder_op mm0, rounder_arg
- paddd mm1,mm0 ; A1 a1
- paddd mm0,mm0
- psubd mm0,mm1 ; A2 a2
-- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
-+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm5 ; B0 b0
-- movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -279,14 +279,14 @@ coeffs:
- packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [ dst ],mm7
- movq mm1,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
- movq [ dst + 24 ],mm2
- pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
-- movq mm7,[coeffs+88] ; C3 C7 C3 C7
-- pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
-+ pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm0 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4,mm7 ; B2 b2
- paddd mm2,mm4 ; A2+B2 a2+b2
- psubd mm0,mm4 ; a2-B2 a2-b2
-@@ -330,17 +330,17 @@ coeffs:
- movq mm1,[src4] ; R6 R2 r6 r2
- movq mm2,[src1] ; R3 R1 r3 r1
- movq mm3,[src5] ; R7 R5 r7 r5
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- ; rounder_op mm0, rounder_arg
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4,mm5 ; A0 a0
-@@ -348,11 +348,11 @@ coeffs:
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0,mm1 ; A1 a1
- psubd mm5,mm1 ; A2 a2
-- movq mm1,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
-- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
-+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm1 ; B0 b0
-- movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -374,13 +374,13 @@ coeffs:
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [ dst + 112],mm4
- movq mm0,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
- pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
-- movq mm7,[coeffs+88] ; C3 C7 C3 C7
-- pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
-+ pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm5 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4,mm7 ; B2 b2
- paddd mm2,mm4 ; A2+B2 a2+b2
- psubd mm5,mm4 ; a2-B2 a2-b2
-@@ -426,13 +426,13 @@ coeffs:
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm1,[src4] ; R6 R2 r6 r2
- movq mm3,[src5] ; R7 R5 r7 r5
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-@@ -442,9 +442,9 @@ coeffs:
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0,mm1 ; A1 a1
- psubd mm5,mm1 ; A2 a2
-- movq mm1,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
-- movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -464,10 +464,10 @@ coeffs:
- movd [ dst + 96 ],mm2
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [ dst + 112 ],mm4
-- movq mm1,[coeffs+88] ; C3 C7 C3 C7
-+ movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
- pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm5 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2,mm1 ; A2+B2 a2+b2
- psubd mm5,mm1 ; a2-B2 a2-b2
- psrad mm2,shift
-@@ -510,17 +510,17 @@ coeffs:
- %define shift %8
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm3,[src5] ; R7 R5 r7 r5
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
- ; rounder_op mm0, rounder_arg
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm1,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
-- movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -540,10 +540,10 @@ coeffs:
- movd [ dst + 96 ],mm2
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [ dst + 112 ],mm4
-- movq mm1,[coeffs+88] ; C3 C7 C3 C7
-+ movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
- pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm5 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2,mm1 ; A2+B2 a2+b2
- psubd mm5,mm1 ; a2-B2 a2-b2
- psrad mm2,shift
-@@ -587,21 +587,21 @@ coeffs:
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm2,[src1] ; R3 R1 r3 r1
- movq mm3,[src5] ; R7 R5 r7 r5
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- ; rounder_op mm0, rounder_arg
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm1,[coeffs+56] ; C7 C5 C7 C5
-+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
- pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
-- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
-+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm1 ; B0 b0
-- movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
-+ movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
- pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -623,13 +623,13 @@ coeffs:
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [dst + 112],mm4
- movq mm0,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
- pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
-- movq mm7,[coeffs+88] ; C3 C7 C3 C7
-- pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
-+ pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2,mm5 ; A2 a2
-- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
-+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4,mm7 ; B2 b2
- paddd mm2,mm4 ; A2+B2 a2+b2
- psubd mm5,mm4 ; a2-B2 a2-b2
-@@ -674,17 +674,17 @@ coeffs:
- %define shift %8
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm2,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- ; rounder_op mm0, rounder_arg
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm3,[coeffs+64]
-+ movq mm3,[ebx + coeffs+64 wrt ..gotoff]
- pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -704,9 +704,9 @@ coeffs:
- movd [dst + 96],mm1
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [dst + 112],mm4
-- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
- pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
-- pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm1,mm5 ; A2 a2
- paddd mm1,mm4 ; A2+B2 a2+b2
- psubd mm5,mm4 ; a2-B2 a2-b2
-@@ -750,13 +750,13 @@ coeffs:
- %define shift %8
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm1,[src4] ; R6 R2 r6 r2
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-@@ -768,13 +768,13 @@ coeffs:
- psubd mm5,mm1 ; A2 a2
- movq mm2,[src0 + 8] ; R4 R0 r4 r0
- movq mm3,[src4 + 8] ; R6 R2 r6 r2
-- movq mm1,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm7,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2
-- pmaddwd mm3,[coeffs+40] ; -C2R6+C6R2 -C2r6+C6r2
-+ pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff] ; -C2R6+C6R2 -C2r6+C6r2
- ; rounder_op mm1, rounder_arg
- paddd mm7,mm1 ; A0 a0
- paddd mm1,mm1 ; 2C0 2c0
-@@ -829,17 +829,17 @@ coeffs:
- movq mm0,[src0] ; R4 R0 r4 r0
- movq mm1,[src4] ; R6 R2 r6 r2
- movq mm2,[src1] ; R3 R1 r3 r1
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm5,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
-- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
-+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
- pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
- ; rounder_op mm4, rounder_arg
- movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+48] ; C3 C1 C3 C1
-+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
- ; rounder_op mm0, rounder_arg
- pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4,mm5 ; A0 a0
-@@ -847,7 +847,7 @@ coeffs:
- movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0,mm1 ; A1 a1
- psubd mm5,mm1 ; A2 a2
-- movq mm1,[coeffs+64]
-+ movq mm1,[ebx + coeffs+64 wrt ..gotoff]
- pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7,mm4 ; A0+B0 a0+b0
- paddd mm4,mm4 ; 2A0 2a0
-@@ -867,9 +867,9 @@ coeffs:
- movd [dst + 96],mm3
- packssdw mm4,mm4 ; A0-B0 a0-b0
- movd [dst + 112],mm4
-- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
-+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
- pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
-- pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
-+ pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm3,mm5 ; A2 a2
- paddd mm3,mm4 ; A2+B2 a2+b2
- psubd mm5,mm4 ; a2-B2 a2-b2
-@@ -912,20 +912,20 @@ coeffs:
- %define rounder_arg %7
- %define shift %8
- movq mm0,[src0] ; R4 R0 r4 r0
-- movq mm4,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
-- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
- ; rounder_op mm4, rounder_arg
- ; rounder_op mm0, rounder_arg
- psrad mm4,shift
- psrad mm0,shift
- movq mm2,[src0 + 8] ; R4 R0 r4 r0
-- movq mm1,[coeffs+16] ; C4 C4 C4 C4
-+ movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
- pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
-- movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
-+ movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
- pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
-- movq mm7,[coeffs+32] ; C6 C2 C6 C2
-+ movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
- ; rounder_op mm1, rounder_arg
- ; rounder_op mm2, rounder_arg
- psrad mm1,shift
-@@ -1073,6 +1073,11 @@ coeffs:
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bx:
-+ mov ebx, [esp]
-+ retn
-+
- cglobal simple_idct_mmx_P
- cglobal simple_idct_mmx
-
-@@ -1083,14 +1088,18 @@ cglobal simple_idct_mmx
-
- ALIGN 16
- simple_idct_mmx_P:
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- sub esp, 128
-- mov edx, [esp+128+4]
-+ mov edx, [esp+128+4+4]
-
- ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt
-- DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
-- Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four
-- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .one
-+ DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11
-+ Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .four
-+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .two
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .one
- IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1099,8 +1108,8 @@ simple_idct_mmx_P:
-
- ALIGN 16
- .four
-- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five
-+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .six
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .five
- IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1109,7 +1118,7 @@ ALIGN 16
-
- ALIGN 16
- .six
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .seven
- IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1118,7 +1127,7 @@ ALIGN 16
-
- ALIGN 16
- .two
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .three
- IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1159,6 +1168,7 @@ ALIGN 16
- .ret
- add esp, 128
-
-+ pop ebx
- ret
- .endfunc
-
-@@ -1174,15 +1184,19 @@ ALIGN 16
-
- ALIGN 16
- simple_idct_mmx:
-+ push ebx
-+ call get_pc.bx
-+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- sub esp, 128
-- mov edx, [esp+128+4]
-+ mov edx, [esp+128+4+4]
- PERMUTEP edx ; permute parm list in place
-
- ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt
-- DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
-- Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP
-- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP
-+ DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11
-+ Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .fourP
-+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .twoP
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .oneP
- IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1191,8 +1205,8 @@ simple_idct_mmx:
-
- ALIGN 16
- .fourP
-- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP
-+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .sixP
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .fiveP
- IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1201,7 +1215,7 @@ ALIGN 16
-
- ALIGN 16
- .sixP
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .sevenP
- IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1210,7 +1224,7 @@ ALIGN 16
-
- ALIGN 16
- .twoP
-- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP
-+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .threeP
- IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
- IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
- IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
-@@ -1251,6 +1265,7 @@ ALIGN 16
- .retP
- add esp, 128
-
-+ pop ebx
- ret
- .endfunc
-
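Note on the idiom used in the hunks above and throughout the rest of this patch: a call to a tiny thunk copies the return address, i.e. the current EIP, into a register; adding _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc turns that into the GOT base; constants are then addressed wrt ..gotoff relative to that base instead of by absolute address, which is what removes the text relocations. The extra push ebx also shifts every incoming stack argument by 4 bytes, hence the [esp+128+4] -> [esp+128+4+4] adjustments. A minimal standalone sketch of the pattern, with illustrative symbol names, which should assemble with nasm -f elf:

    BITS 32
    SECTION .rodata
    some_const: dd 0x01010101, 0x01010101   ; illustrative 8-byte constant

    SECTION .text
    extern _GLOBAL_OFFSET_TABLE_
    get_pc.bx:                  ; thunk: the return address is the caller's next EIP
        mov ebx, [esp]
        retn

    load_const_pic:
        push ebx                ; ebx is callee-saved in the i386 ABI
        call get_pc.bx          ; ebx = EIP of the next instruction
        add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebx = GOT base
        movq mm7, [ebx + some_const wrt ..gotoff]           ; PIC load, no textrel
        pop ebx
        ret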
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_mmx.inc xvidcore-1.1.2/src/image/x86_asm/colorspace_mmx.inc
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_mmx.inc 2004-08-29 12:02:38.000000000 +0200
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_mmx.inc 2007-01-27 16:34:11.000000000 +0100
-@@ -56,11 +56,14 @@ NAME:
- push edi ; esp + localsize + 4
- push ebp ; esp + localsize + 0
-
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %define x_dif esp + localsize - 4
- %define y_dif esp + localsize - 8
- %define uv_dif esp + localsize - 12
- %define fixed_width esp + localsize - 16
--%define tmp_height esp + localsize - 20
-+%define tmp_fixed_width esp + localsize - 20
-
- sub esp, localsize
-
-@@ -90,8 +93,6 @@ NAME:
- mov esi, [y_ptr] ; $esi$ = y_ptr
- mov edi, [x_ptr] ; $edi$ = x_ptr
- mov edx, [x_stride] ; $edx$ = x_stride
-- mov ebp, [height] ; $ebp$ = height
--
-
- mov ebx, [vflip]
- or ebx, ebx
-@@ -106,7 +107,7 @@ NAME:
- sub ebx, edx
- mov [x_dif], ebx ; x_dif = -BYTES*fixed_width - x_stride
-
-- mov eax, ebp
-+ mov eax, [height]
- sub eax, 1
- push edx
- mul edx
-@@ -126,8 +127,8 @@ NAME:
- FUNC %+ _INIT ARG1, ARG2 ; call FUNC_INIT
-
- .y_loop
-- mov [tmp_height], ebp
-- mov ebp, [fixed_width]
-+ push dword [fixed_width]
-+ pop dword [tmp_fixed_width]
-
- .x_loop
- FUNC ARG1, ARG2 ; call FUNC
-@@ -137,10 +138,9 @@ NAME:
- add ebx, PIXELS/2 ; u_ptr += PIXELS/2
- add ecx, PIXELS/2 ; v_ptr += PIXELS/2
-
-- sub ebp, PIXELS ; $ebp$ -= PIXELS
-+	sub dword [tmp_fixed_width], PIXELS	; tmp_fixed_width -= PIXELS
- jg .x_loop ; if ($ebp$ > 0) goto .x_loop
-
-- mov ebp, [tmp_height]
- add edi, [x_dif] ; x_ptr += x_dif + (VPIXELS-1)*x_stride
- add esi, [y_dif] ; y_ptr += y_dif + (VPIXELS-1)*y_stride
- %rep VPIXELS-1
-@@ -155,7 +155,7 @@ NAME:
- add ecx, [uv_stride]
- %endrep
-
-- sub ebp, VPIXELS ; $ebp$ -= VPIXELS
-+	sub dword [height], VPIXELS	; height -= VPIXELS
- jg .y_loop ; if ($ebp$ > 0) goto .y_loop
-
- ; cleanup stack & undef everything
-@@ -181,7 +181,6 @@ NAME:
- %undef y_dif
- %undef uv_dif
- %undef fixed_width
--%undef tmp_height
- ret
- .endfunc
- %undef NAME
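With ebp permanently holding the GOT base inside this include, the height counter it used to occupy is moved into stack slots instead. The push/pop pair above is the usual x86 idiom for copying a dword memory-to-memory when no scratch register is free; in isolation, with illustrative slot names standing in for the fixed_width/tmp_fixed_width defines:

    push dword [src_slot]       ; read the source dword onto the stack
    pop  dword [dst_slot]       ; pop it straight into the destination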
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_rgb_mmx.asm xvidcore-1.1.2/src/image/x86_asm/colorspace_rgb_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_rgb_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_rgb_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -120,7 +120,7 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- ;------------------------------------------------------------------------------
-
- %macro BGR_TO_YV12_INIT 2
-- movq mm7, [y_mul]
-+ movq mm7, [ebp + y_mul wrt ..gotoff]
- %endmacro
-
-
-@@ -184,8 +184,8 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
-
- ; u_ptr, v_ptr
- movq mm0, mm6 ; = [ |b4|g4|r4]
-- pmaddwd mm6, [v_mul] ; *= V_MUL
-- pmaddwd mm0, [u_mul] ; *= U_MUL
-+ pmaddwd mm6, [ebp + v_mul wrt ..gotoff] ; *= V_MUL
-+ pmaddwd mm0, [ebp + u_mul wrt ..gotoff] ; *= U_MUL
- movq mm1, mm0
- movq mm2, mm6
- psrlq mm1, 32
-@@ -230,30 +230,30 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- movd mm3, [ecx] ; v_ptr[0]
- punpcklbw mm2, mm7 ; u3u2u1u0 -> mm2
- punpcklbw mm3, mm7 ; v3v2v1v0 -> mm3
-- psubsw mm2, [U_SUB] ; U - 128
-- psubsw mm3, [V_SUB] ; V - 128
-+ psubsw mm2, [ebp + U_SUB wrt ..gotoff] ; U - 128
-+ psubsw mm3, [ebp + V_SUB wrt ..gotoff] ; V - 128
- movq mm4, mm2
- movq mm5, mm3
-- pmullw mm2, [UG_MUL]
-- pmullw mm3, [VG_MUL]
-+ pmullw mm2, [ebp + UG_MUL wrt ..gotoff]
-+ pmullw mm3, [ebp + VG_MUL wrt ..gotoff]
- movq mm6, mm2 ; u3u2u1u0 -> mm6
- punpckhwd mm2, mm2 ; u3u3u2u2 -> mm2
- punpcklwd mm6, mm6 ; u1u1u0u0 -> mm6
-- pmullw mm4, [UB_MUL] ; B_ADD -> mm4
-+ pmullw mm4, [ebp + UB_MUL wrt ..gotoff] ; B_ADD -> mm4
- movq mm0, mm3
- punpckhwd mm3, mm3 ; v3v3v2v2 -> mm2
- punpcklwd mm0, mm0 ; v1v1v0v0 -> mm6
- paddsw mm2, mm3
- paddsw mm6, mm0
-- pmullw mm5, [VR_MUL] ; R_ADD -> mm5
-+ pmullw mm5, [ebp + VR_MUL wrt ..gotoff] ; R_ADD -> mm5
- movq mm0, [esi] ; y7y6y5y4y3y2y1y0 -> mm0
- movq mm1, mm0
- punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
- punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
-- psubsw mm0, [Y_SUB] ; Y - Y_SUB
-- psubsw mm1, [Y_SUB] ; Y - Y_SUB
-- pmullw mm1, [Y_MUL]
-- pmullw mm0, [Y_MUL]
-+ psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
-+ psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
-+ pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
-+ pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
- movq [TEMP_Y2], mm1 ; y7y6y5y4 -> mm3
- movq [TEMP_Y1], mm0 ; y3y2y1y0 -> mm7
- psubsw mm1, mm2 ; g7g6g5g4 -> mm1
-@@ -266,10 +266,10 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
- movq mm1, mm0
- punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
- punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
-- psubsw mm0, [Y_SUB] ; Y - Y_SUB
-- psubsw mm1, [Y_SUB] ; Y - Y_SUB
-- pmullw mm1, [Y_MUL]
-- pmullw mm0, [Y_MUL]
-+ psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
-+ psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
-+ pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
-+ pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
- movq mm3, mm1
- psubsw mm1, mm2 ; g7g6g5g4 -> mm1
- movq mm2, mm0
-@@ -419,6 +419,11 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+ mov ebp, [esp]
-+ retn
-+
- %include "colorspace_mmx.inc"
-
- ; input
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm xvidcore-1.1.2/src/image/x86_asm/colorspace_yuyv_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/colorspace_yuyv_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -76,7 +76,7 @@ mmx_one: dw 1, 1, 1, 1
- ;-----------------------------------------------------------------------------
-
- %macro YUYV_TO_YV12_INIT 2
-- movq mm7, [yuyv_mask]
-+ movq mm7, [ebp + yuyv_mask wrt ..gotoff]
- %endmacro
-
-
-@@ -108,8 +108,8 @@ mmx_one: dw 1, 1, 1, 1
- pand mm5, mm7
- pand mm6, mm7
- paddw mm5, mm6
-- paddw mm4, [mmx_one] ; +1 rounding
-- paddw mm5, [mmx_one] ;
-+ paddw mm4, [ebp + mmx_one wrt ..gotoff] ; +1 rounding
-+ paddw mm5, [ebp + mmx_one wrt ..gotoff] ;
- psrlw mm4, 1
- psrlw mm5, 1
- ;---[ 3dnow/xmm ]----------------------------------------------------
-@@ -310,6 +310,11 @@ mmx_one: dw 1, 1, 1, 1
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+ mov ebp, [esp]
-+ retn
-+
- %include "colorspace_mmx.inc"
-
- ; input
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dn.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dn.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dn.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dn.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -44,20 +44,6 @@ BITS 32
- %endmacro
-
- ;=============================================================================
--; Read Only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
-- times 8 db 1
--
--;=============================================================================
- ; Code
- ;=============================================================================
-
-@@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_3dn:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_3DN_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_3DN_RND1
-@@ -206,7 +195,10 @@ interpolate8x8_halfpel_v_3dn:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- movq mm2, [eax] ; loop invariant
- add eax, edx
-
-@@ -329,7 +321,10 @@ interpolate8x8_halfpel_hv_3dn
- mov eax, [esp+ 8] ; Src
- mov edx, [esp+12] ; stride
-
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
-
- ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
- movq mm2, [eax]
-@@ -387,7 +382,10 @@ interpolate8x4_halfpel_h_3dn:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_3DN_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_3DN_RND1
-@@ -424,7 +422,10 @@ interpolate8x4_halfpel_v_3dn:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- movq mm2, [eax] ; loop invariant
- add eax, edx
-
-@@ -462,7 +463,10 @@ interpolate8x4_halfpel_hv_3dn
- mov eax, [esp+ 8] ; Src
- mov edx, [esp+12] ; stride
-
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
-
- ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
- movq mm2, [eax]
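Where a constant is only needed briefly, the hunks above avoid the GOT altogether and build it on the stack: the two pushes of 0x01010101 recreate the former mmx_one vector (times 8 db 1) at [esp]. The rounding identity in the comments holds because i + j = (i^j) + 2*(i&j) for unsigned bytes, so the truncating average equals the rounding-up average, which pavgusb/pavgb compute, minus (i^j)&1. The stack trick in isolation:

    push dword 0x01010101       ; high half of the 8-byte vector
    push dword 0x01010101       ; low half; [esp] now matches the old mmx_one
    movq mm7, [esp]             ; mm7 = 01 01 01 01 01 01 01 01
    add esp, byte 8             ; caution: add rewrites EFLAGS; when a later branch
                                ; still needs the flags, lea esp, [esp+8] must be
                                ; used instead (see the hv routines further down)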
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dne.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dne.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_3dne.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_3dne.asm 2007-01-27 17:25:51.000000000 +0100
-@@ -45,24 +45,6 @@ BITS 32
- %endmacro
-
- ;=============================================================================
--; Read only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
-- times 8 db 1
--
--ALIGN 8
--mm_minusone:
-- dd -1,-1
--
--;=============================================================================
- ; Macros
- ;=============================================================================
-
-@@ -149,7 +131,10 @@ interpolate8x8_halfpel_h_3dne:
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
- mov ecx, [esp+ 4] ; Dst
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_SSE_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_SSE_RND1
-@@ -223,15 +208,15 @@ ALIGN 8
- psubusb mm0, [eax]
- add eax, edx
- mov ecx, [esp+ 4] ; Dst
-- push esi
-+ push byte -1
-+ push byte -1
- pcmpeqb mm1, mm1
- pcmpeqb mm2, mm2
-- mov esi, mm_minusone
- psubusb mm1, [byte eax]
- psubusb mm2, [eax+edx]
- lea eax, [eax+2*edx]
-- movq mm6, [esi]
-- movq mm7, [esi]
-+ movq mm6, [esp]
-+ movq mm7, [esp]
- pavgb mm0, mm1
- pavgb mm1, mm2
- psubusb mm6, mm0
-@@ -246,8 +231,8 @@ ALIGN 8
- lea eax, [eax+2*edx]
- pavgb mm2, mm3
- pavgb mm3, mm4
-- movq mm0, [esi]
-- movq mm1, [esi]
-+ movq mm0, [esp]
-+ movq mm1, [esp]
- psubusb mm0, mm2
- psubusb mm1, mm3
- movq [ecx], mm0
-@@ -261,8 +246,8 @@ ALIGN 8
- lea eax, [eax+2*edx]
- pavgb mm4, mm5
- pavgb mm5, mm6
-- movq mm2, [esi]
-- movq mm3, [esi]
-+ movq mm2, [esp]
-+ movq mm3, [esp]
- psubusb mm2, mm4
- psubusb mm3, mm5
- movq [ecx], mm2
-@@ -274,10 +259,10 @@ ALIGN 8
- psubusb mm0, [eax+edx]
- pavgb mm6, mm7
- pavgb mm7, mm0
-- movq mm4, [esi]
-- movq mm5, [esi]
-+ movq mm4, [esp]
-+ movq mm5, [esp]
- psubusb mm4, mm6
-- pop esi
-+ add esp, byte 8
- psubusb mm5, mm7
- movq [ecx], mm4
- movq [ecx+edx], mm5
-@@ -391,7 +376,10 @@ interpolate8x8_halfpel_hv_3dne:
- pavgb mm2, mm3
- pxor mm3, mm6 ; mm2/mm3 ready
- mov ecx, [esp+ 4] ; Dst
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+	lea esp, [esp + 8]	; keep EFLAGS intact for the jz below (add esp would clobber them)
-
- jz near .rounding1
- lea ebp,[byte ebp]
-@@ -443,7 +431,10 @@ interpolate8x4_halfpel_h_3dne:
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
- mov ecx, [esp+ 4] ; Dst
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_SSE_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_SSE_RND1
-@@ -501,16 +492,15 @@ ALIGN 8
- add eax, edx ; eax==line1
- mov ecx, [esp+ 4] ; Dst
-
-- push esi
--
- pcmpeqb mm1, mm1
- pcmpeqb mm2, mm2
-- mov esi, mm_minusone
-+ push byte -1
-+ push byte -1
- psubusb mm1, [byte eax] ; line1
- psubusb mm2, [eax+edx] ; line2
- lea eax, [eax+2*edx] ; eax==line3
-- movq mm6, [esi]
-- movq mm7, [esi]
-+ movq mm6, [esp]
-+ movq mm7, [esp]
- pavgb mm0, mm1
- pavgb mm1, mm2
- psubusb mm6, mm0
-@@ -526,15 +516,13 @@ ALIGN 8
- lea eax, [eax+2*edx] ; eax==line 5
- pavgb mm2, mm3
- pavgb mm3, mm4
-- movq mm0, [esi]
-- movq mm1, [esi]
-+ movq mm0, [esp]
-+ movq mm1, [esp]
- psubusb mm0, mm2
- psubusb mm1, mm3
- movq [ecx], mm0
- movq [ecx+edx], mm1
-
-- pop esi
--
- ret
-
- .endfunc
-@@ -562,7 +550,10 @@ interpolate8x4_halfpel_hv_3dne:
- pavgb mm2, mm3
- pxor mm3, mm6 ; mm2/mm3 ready
- mov ecx, [esp+ 4] ; Dst
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ lea esp, [esp + 8]
-
- jz near .rounding1
- lea ebp,[byte ebp]
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_mmx.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -166,13 +166,17 @@ interpolate8x8_halfpel_h_mmx:
-
- push esi
- push edi
-- mov eax, [esp + 8 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-
-- movq mm7, [rounding1_mmx + eax * 8]
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-- mov edx, [esp + 8 + 12] ; stride
-+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-+
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-+ mov edx, [esp + 12 + 12] ; stride
-
- pxor mm6, mm6 ; zero
-
-@@ -185,6 +189,7 @@ interpolate8x8_halfpel_h_mmx:
- COPY_H_MMX
- COPY_H_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -225,13 +230,17 @@ interpolate8x8_halfpel_v_mmx:
- push esi
- push edi
-
-- mov eax, [esp + 8 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- movq mm7, [rounding1_mmx + eax * 8]
-+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-- mov edx, [esp + 8 + 12] ; stride
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-+ mov edx, [esp + 12 + 12] ; stride
-
- pxor mm6, mm6 ; zero
-
-@@ -245,6 +254,7 @@ interpolate8x8_halfpel_v_mmx:
- COPY_V_MMX
- COPY_V_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -315,18 +325,22 @@ interpolate8x8_halfpel_hv_mmx:
- push esi
- push edi
-
-- mov eax, [esp + 8 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-
-- movq mm7, [rounding2_mmx + eax * 8]
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-+ movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
-+
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-
- mov eax, 8
-
- pxor mm6, mm6 ; zero
-
-- mov edx, [esp + 8 + 12] ; stride
-+ mov edx, [esp + 12 + 12] ; stride
-
- COPY_HV_MMX
- COPY_HV_MMX
-@@ -337,6 +351,7 @@ interpolate8x8_halfpel_hv_mmx:
- COPY_HV_MMX
- COPY_HV_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -357,13 +372,18 @@ interpolate8x4_halfpel_h_mmx:
-
- push esi
- push edi
-- mov eax, [esp + 8 + 16] ; rounding
-
-- movq mm7, [rounding1_mmx + eax * 8]
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-- mov edx, [esp + 8 + 12] ; stride
-+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-+
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-+ mov edx, [esp + 12 + 12] ; stride
-
- pxor mm6, mm6 ; zero
-
-@@ -372,6 +392,7 @@ interpolate8x4_halfpel_h_mmx:
- COPY_H_MMX
- COPY_H_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -394,13 +415,17 @@ interpolate8x4_halfpel_v_mmx:
- push esi
- push edi
-
-- mov eax, [esp + 8 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- movq mm7, [rounding1_mmx + eax * 8]
-+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-- mov edx, [esp + 8 + 12] ; stride
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-+ mov edx, [esp + 12 + 12] ; stride
-
- pxor mm6, mm6 ; zero
-
-@@ -410,6 +435,7 @@ interpolate8x4_halfpel_v_mmx:
- COPY_V_MMX
- COPY_V_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -433,24 +459,29 @@ interpolate8x4_halfpel_hv_mmx:
- push esi
- push edi
-
-- mov eax, [esp + 8 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-
-- movq mm7, [rounding2_mmx + eax * 8]
-+ mov eax, [esp + 12 + 16] ; rounding
-
-- mov edi, [esp + 8 + 4] ; dst
-- mov esi, [esp + 8 + 8] ; src
-+ movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
-+
-+ mov edi, [esp + 12 + 4] ; dst
-+ mov esi, [esp + 12 + 8] ; src
-
- mov eax, 8
-
- pxor mm6, mm6 ; zero
-
-- mov edx, [esp + 8 + 12] ; stride
-+ mov edx, [esp + 12 + 12] ; stride
-
- COPY_HV_MMX
- COPY_HV_MMX
- COPY_HV_MMX
- COPY_HV_MMX
-
-+ pop ebp
- pop edi
- pop esi
-
-@@ -491,10 +522,10 @@ interpolate8x4_halfpel_hv_mmx:
-
- por mm3, mm6
-
-- pand mm0, [mmx_mask]
-- pand mm1, [mmx_mask]
-- pand mm4, [mmx_mask]
-- pand mm5, [mmx_mask]
-+ pand mm0, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm1, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm4, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm5, [ebp + mmx_mask wrt ..gotoff]
-
- psrlq mm0, 1 ; src1 / 2
- psrlq mm1, 1 ; src2 / 2
-@@ -538,10 +569,10 @@ interpolate8x4_halfpel_hv_mmx:
-
- pand mm3, mm6
-
-- pand mm0, [mmx_mask]
-- pand mm1, [mmx_mask]
-- pand mm4, [mmx_mask]
-- pand mm5, [mmx_mask]
-+ pand mm0, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm1, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm4, [ebp + mmx_mask wrt ..gotoff]
-+ pand mm5, [ebp + mmx_mask wrt ..gotoff]
-
- psrlq mm0, 1 ; src1 / 2
- psrlq mm1, 1 ; src2 / 2
-@@ -567,21 +598,25 @@ interpolate8x8_avg2_mmx:
-
- push ebx
-
-- mov eax, [esp + 4 + 20] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 8 + 20] ; rounding
- test eax, eax
-
- jnz near .rounding1
-
-- mov eax, [esp + 4 + 24] ; height -> eax
-+ mov eax, [esp + 8 + 24] ; height -> eax
- sub eax, 8
- test eax, eax
-
-- mov ecx, [esp + 4 + 4] ; dst -> edi
-- mov eax, [esp + 4 + 8] ; src1 -> esi
-- mov ebx, [esp + 4 + 12] ; src2 -> eax
-- mov edx, [esp + 4 + 16] ; stride -> edx
-+ mov ecx, [esp + 8 + 4] ; dst -> edi
-+ mov eax, [esp + 8 + 8] ; src1 -> esi
-+ mov ebx, [esp + 8 + 12] ; src2 -> eax
-+ mov edx, [esp + 8 + 16] ; stride -> edx
-
-- movq mm7, [mmx_one]
-+ movq mm7, [ebp + mmx_one wrt ..gotoff]
-
- jz near .start0
-
-@@ -602,16 +637,16 @@ interpolate8x8_avg2_mmx:
- ret
-
- .rounding1
-- mov eax, [esp + 4 + 24] ; height -> eax
-+ mov eax, [esp + 8 + 24] ; height -> eax
- sub eax, 8
- test eax, eax
-
-- mov ecx, [esp + 4 + 4] ; dst -> edi
-- mov eax, [esp + 4 + 8] ; src1 -> esi
-- mov ebx, [esp + 4 + 12] ; src2 -> eax
-- mov edx, [esp + 4 + 16] ; stride -> edx
-+ mov ecx, [esp + 8 + 4] ; dst -> edi
-+ mov eax, [esp + 8 + 8] ; src1 -> esi
-+ mov ebx, [esp + 8 + 12] ; src2 -> eax
-+ mov edx, [esp + 8 + 16] ; stride -> edx
-
-- movq mm7, [mmx_one]
-+ movq mm7, [ebp + mmx_one wrt ..gotoff]
-
- jz near .start1
-
-@@ -628,6 +663,7 @@ interpolate8x8_avg2_mmx:
- lea ecx, [ecx+2*edx]
- AVG2_MMX_RND1
-
-+ pop ebp
- pop ebx
- ret
- .endfunc
-@@ -652,11 +688,11 @@ interpolate8x8_avg2_mmx:
- movq mm2, mm0
- movq mm3, mm1
-
-- pand mm2, [mmx_three]
-- pand mm3, [mmx_three]
-+ pand mm2, [ebp + mmx_three wrt ..gotoff]
-+ pand mm3, [ebp + mmx_three wrt ..gotoff]
-
-- pand mm0, [mmx_mask2]
-- pand mm1, [mmx_mask2]
-+ pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
-+ pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm0, 2
- psrlq mm1, 2
-@@ -673,11 +709,11 @@ interpolate8x8_avg2_mmx:
- movq mm1, mm4
- movq mm3, mm5
-
-- pand mm1, [mmx_three]
-- pand mm3, [mmx_three]
-+ pand mm1, [ebp + mmx_three wrt ..gotoff]
-+ pand mm3, [ebp + mmx_three wrt ..gotoff]
-
-- pand mm4, [mmx_mask2]
-- pand mm5, [mmx_mask2]
-+ pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
-+ pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm4, 2
- psrlq mm5, 2
-@@ -688,8 +724,8 @@ interpolate8x8_avg2_mmx:
- paddb mm1, mm3
- paddb mm2, mm1
-
-- paddb mm2, [mmx_two]
-- pand mm2, [mmx_mask2]
-+ paddb mm2, [ebp + mmx_two wrt ..gotoff]
-+ pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm2, 2
- paddb mm0, mm2
-@@ -707,11 +743,11 @@ interpolate8x8_avg2_mmx:
- movq mm2, mm0
- movq mm3, mm1
-
-- pand mm2, [mmx_three]
-- pand mm3, [mmx_three]
-+ pand mm2, [ebp + mmx_three wrt ..gotoff]
-+ pand mm3, [ebp + mmx_three wrt ..gotoff]
-
-- pand mm0, [mmx_mask2]
-- pand mm1, [mmx_mask2]
-+ pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
-+ pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm0, 2
- psrlq mm1, 2
-@@ -728,11 +764,11 @@ interpolate8x8_avg2_mmx:
- movq mm1, mm4
- movq mm3, mm5
-
-- pand mm1, [mmx_three]
-- pand mm3, [mmx_three]
-+ pand mm1, [ebp + mmx_three wrt ..gotoff]
-+ pand mm3, [ebp + mmx_three wrt ..gotoff]
-
-- pand mm4, [mmx_mask2]
-- pand mm5, [mmx_mask2]
-+ pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
-+ pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm4, 2
- psrlq mm5, 2
-@@ -743,8 +779,8 @@ interpolate8x8_avg2_mmx:
- paddb mm1, mm3
- paddb mm2, mm1
-
-- paddb mm2, [mmx_one]
-- pand mm2, [mmx_mask2]
-+ paddb mm2, [ebp + mmx_one wrt ..gotoff]
-+ pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
-
- psrlq mm2, 2
- paddb mm0, mm2
-@@ -762,18 +798,22 @@ interpolate8x8_avg4_mmx:
- push edi
- push esi
-
-- mov eax, [esp + 12 + 28] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 16 + 28] ; rounding
-
- test eax, eax
-
-- mov ecx, [esp + 12 + 4] ; dst -> edi
-- mov eax, [esp + 12 + 8] ; src1 -> esi
-- mov ebx, [esp + 12 + 12] ; src2 -> eax
-- mov esi, [esp + 12 + 16] ; src3 -> esi
-- mov edi, [esp + 12 + 20] ; src4 -> edi
-- mov edx, [esp + 12 + 24] ; stride -> edx
-+ mov ecx, [esp + 16 + 4] ; dst -> edi
-+ mov eax, [esp + 16 + 8] ; src1 -> esi
-+ mov ebx, [esp + 16 + 12] ; src2 -> eax
-+ mov esi, [esp + 16 + 16] ; src3 -> esi
-+ mov edi, [esp + 16 + 20] ; src4 -> edi
-+ mov edx, [esp + 16 + 24] ; stride -> edx
-
-- movq mm7, [mmx_one]
-+ movq mm7, [ebp + mmx_one wrt ..gotoff]
-
- jnz near .rounding1
-
-@@ -815,6 +855,7 @@ interpolate8x8_avg4_mmx:
- lea ecx, [ecx+edx]
- AVG4_MMX_RND1
-
-+ pop ebp
- pop esi
- pop edi
- pop ebx
-@@ -868,8 +909,8 @@ interpolate8x8_avg4_mmx:
- psubsw mm0, mm2
- psubsw mm1, mm3
-
-- pmullw mm0, [mmx_five]
-- pmullw mm1, [mmx_five]
-+ pmullw mm0, [ebp + mmx_five wrt ..gotoff]
-+ pmullw mm1, [ebp + mmx_five wrt ..gotoff]
-
- movq mm2, [eax-2]
- movq mm4, [eax+3]
-@@ -903,13 +944,17 @@ interpolate8x8_avg4_mmx:
- ALIGN 16
- interpolate8x8_6tap_lowpass_h_mmx:
-
-- mov eax, [esp + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
-+ mov eax, [esp + 20] ; rounding
-
-- movq mm6, [rounding_lowpass_mmx + eax * 8]
-+ movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
-
-- mov ecx, [esp + 4] ; dst -> edi
-- mov eax, [esp + 8] ; src -> esi
-- mov edx, [esp + 12] ; stride -> edx
-+ mov ecx, [esp + 8] ; dst -> edi
-+ mov eax, [esp + 12] ; src -> esi
-+ mov edx, [esp + 16] ; stride -> edx
-
- pxor mm7, mm7
-
-@@ -929,6 +974,7 @@ interpolate8x8_6tap_lowpass_h_mmx:
- lea ecx, [ecx+edx]
- LOWPASS_6TAP_H_MMX
-
-+ pop ebp
- ret
- .endfunc
-
-@@ -979,8 +1025,8 @@ interpolate8x8_6tap_lowpass_h_mmx:
- psubsw mm0, mm2
- psubsw mm1, mm3
-
-- pmullw mm0, [mmx_five]
-- pmullw mm1, [mmx_five]
-+ pmullw mm0, [ebp + mmx_five wrt ..gotoff]
-+ pmullw mm1, [ebp + mmx_five wrt ..gotoff]
-
- movq mm2, [eax+edx]
- movq mm4, [eax+2*ebx]
-@@ -1016,13 +1062,17 @@ interpolate8x8_6tap_lowpass_v_mmx:
-
- push ebx
-
-- mov eax, [esp + 4 + 16] ; rounding
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-
-- movq mm6, [rounding_lowpass_mmx + eax * 8]
-+ mov eax, [esp + 8 + 16] ; rounding
-
-- mov ecx, [esp + 4 + 4] ; dst -> edi
-- mov eax, [esp + 4 + 8] ; src -> esi
-- mov edx, [esp + 4 + 12] ; stride -> edx
-+ movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
-+
-+ mov ecx, [esp + 8 + 4] ; dst -> edi
-+ mov eax, [esp + 8 + 8] ; src -> esi
-+ mov edx, [esp + 8 + 12] ; stride -> edx
-
- mov ebx, edx
- shl ebx, 1
-@@ -1046,6 +1096,7 @@ interpolate8x8_6tap_lowpass_v_mmx:
- lea ecx, [ecx+edx]
- LOWPASS_6TAP_V_MMX
-
-+ pop ebp
- pop ebx
- ret
- .endfunc
-@@ -1066,12 +1117,17 @@ interpolate8x8_6tap_lowpass_v_mmx:
-
- %macro PROLOG 2 ; %1: Rounder, %2 load Dst-Rounder
- pxor mm6, mm6
-- movq mm7, [%1] ; TODO: dangerous! (eax isn't checked)
-+ PROLOG0
-+
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+
- %if %2
-- movq mm5, [rounding1_mmx]
-+ movq mm5, [ebp + rounding1_mmx wrt ..gotoff]
- %endif
-
-- PROLOG0
-+ movq mm7, [ebp + %1 wrt ..gotoff] ; TODO: dangerous! (eax isn't checked)
- %endmacro
-
- ; performs: mm0 == (mm0+mm2) mm1 == (mm1+mm3)
-@@ -1160,6 +1216,7 @@ interpolate8x8_halfpel_add_mmx:
- ADD_FF_MMX 1
- ADD_FF_MMX 1
- ADD_FF_MMX 0
-+ pop ebp
- ret
- .endfunc
-
-@@ -1206,6 +1263,7 @@ interpolate8x8_halfpel_h_add_mmx:
- ADD_FH_MMX
- lea ecx,[ecx+edx]
- ADD_FH_MMX
-+ pop ebp
- ret
- .endfunc
-
-@@ -1253,6 +1311,7 @@ interpolate8x8_halfpel_v_add_mmx:
- ADD_HF_MMX
- lea ecx,[ecx+edx]
- ADD_HF_MMX
-+ pop ebp
- ret
- .endfunc
-
-@@ -1318,8 +1377,8 @@ interpolate8x8_halfpel_v_add_mmx:
- paddusw mm0, mm4 ; mix Src(mm0/mm1) with Dst(mm2/mm3)
- paddusw mm1, mm5
-
-- paddusw mm0, [rounding1_mmx]
-- paddusw mm1, [rounding1_mmx]
-+ paddusw mm0, [ebp + rounding1_mmx wrt ..gotoff]
-+ paddusw mm1, [ebp + rounding1_mmx wrt ..gotoff]
-
- psrlw mm0, 1
- psrlw mm1, 1
-@@ -1329,6 +1388,11 @@ interpolate8x8_halfpel_v_add_mmx:
- movq [ecx], mm0
- %endmacro
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+ mov ebp, [esp]
-+ retn
-+
- ALIGN 16
- interpolate8x8_halfpel_hv_add_mmx:
- PROLOG rounding2_mmx, 0 ; mm5 is busy. Don't load dst-rounder
-@@ -1364,6 +1428,7 @@ interpolate8x8_halfpel_hv_add_mmx:
- lea ecx,[ecx+edx]
- ADD_HH_MMX
-
-+ pop ebp
- ret
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_xmm.asm xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_xmm.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/interpolate8x8_xmm.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/interpolate8x8_xmm.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -42,20 +42,6 @@ BITS 32
- %endif
- %endmacro
-
--;=============================================================================
--; Read only data
--;=============================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--ALIGN 16
--mmx_one:
-- times 8 db 1
--
- SECTION .text
-
- cglobal interpolate8x8_halfpel_h_xmm
-@@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_xmm:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_SSE_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_SSE_RND1
-@@ -204,7 +193,10 @@ interpolate8x8_halfpel_v_xmm:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- movq mm2, [eax] ; loop invariant
- add eax, edx
-
-@@ -326,7 +318,10 @@ interpolate8x8_halfpel_hv_xmm:
- mov eax, [esp+ 8] ; Src
- mov edx, [esp+12] ; stride
-
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
-
- ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
- movq mm2, [eax]
-@@ -384,7 +379,10 @@ interpolate8x4_halfpel_h_xmm:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- COPY_H_SSE_RND1
- lea ecx, [ecx+2*edx]
- COPY_H_SSE_RND1
-@@ -419,7 +417,10 @@ interpolate8x4_halfpel_v_xmm:
-
- .rounding1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
- movq mm2, [eax] ; loop invariant
- add eax, edx
-
-@@ -458,7 +459,10 @@ interpolate8x4_halfpel_hv_xmm:
- mov eax, [esp+ 8] ; Src
- mov edx, [esp+12] ; stride
-
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
-
- ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
- movq mm2, [eax]
-@@ -583,8 +587,8 @@ interpolate8x8_halfpel_add_xmm: ; 23c
- pxor mm2, mm4
- pavgb mm1, mm3
- pxor mm3, mm5
-- pand mm2, [mmx_one]
-- pand mm3, [mmx_one]
-+ pand mm2, [esp]
-+ pand mm3, [esp]
- psubb mm0, mm2
- psubb mm1, mm3
- pavgb mm0, [ecx+%1]
-@@ -612,6 +616,8 @@ interpolate8x8_halfpel_h_add_xmm: ; 32
- .Loop1
- ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
- ; movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
- ADD_FH_RND1 0, edx
- lea eax,[eax+2*edx]
- lea ecx,[ecx+2*edx]
-@@ -622,6 +628,7 @@ interpolate8x8_halfpel_h_add_xmm: ; 32
- lea eax,[eax+2*edx]
- lea ecx,[ecx+2*edx]
- ADD_FH_RND1 0, edx
-+ add esp, byte 8
- EPILOG
- .endfunc
-
-@@ -686,7 +693,10 @@ interpolate8x8_halfpel_v_add_xmm:
-
- .Loop1
- movq mm0, [eax] ; loop invariant
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-+ add esp, byte 8
-
- ADD_8_HF_RND1
- movq mm0, mm2
-@@ -809,7 +819,9 @@ ALIGN 16
- interpolate8x8_halfpel_hv_add_xmm:
- PROLOG1
-
-- movq mm7, [mmx_one]
-+ push dword 0x01010101
-+ push dword 0x01010101
-+ movq mm7, [esp]
-
- ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
- movq mm2, [eax]
-@@ -838,6 +850,7 @@ interpolate8x8_halfpel_hv_add_xmm:
- add ecx, edx
- ADD_HH_RND1
-
-+ add esp, byte 8
- EPILOG
- .endfunc
-
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_mmx.asm xvidcore-1.1.2/src/image/x86_asm/postprocessing_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/postprocessing_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -70,6 +70,11 @@ mmx_offset:
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.bp:
-+ mov ebp, [esp]
-+ retn
-+
- cglobal image_brightness_mmx
-
-
-@@ -83,16 +88,19 @@ image_brightness_mmx:
- push esi
- push edi
-
-- movq mm6, [mmx_0x80]
--
- mov eax, [esp+8+20] ; offset
-- movq mm7, [mmx_offset + (eax + 128)*8] ; being lazy
--
- mov edx, [esp+8+4] ; Dst
- mov ecx, [esp+8+8] ; stride
- mov esi, [esp+8+12] ; width
- mov edi, [esp+8+16] ; height
-
-+ push ebp
-+ call get_pc.bp
-+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+ movq mm6, [ebp + mmx_0x80 wrt ..gotoff]
-+ movq mm7, [ebp + (eax + 128)*8 + mmx_offset wrt ..gotoff] ; being lazy
-+ pop ebp
-+
- .yloop
- xor eax, eax
-
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_sse2.asm xvidcore-1.1.2/src/image/x86_asm/postprocessing_sse2.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/postprocessing_sse2.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/postprocessing_sse2.asm 2007-01-27 19:50:10.000000000 +0100
-@@ -42,19 +42,6 @@ BITS 32
- %endif
- %endmacro
-
--;===========================================================================
--; read only data
--;===========================================================================
--
--%ifdef FORMAT_COFF
--SECTION .rodata
--%else
--SECTION .rodata align=16
--%endif
--
--xmm_0x80:
-- times 16 db 0x80
--
- ;=============================================================================
- ; Code
- ;=============================================================================
-@@ -69,21 +56,9 @@ cglobal image_brightness_sse2
-
- %macro CREATE_OFFSET_VECTOR 2
- mov [%1 + 0], %2
-- mov [%1 + 1], %2
-- mov [%1 + 2], %2
-- mov [%1 + 3], %2
- mov [%1 + 4], %2
-- mov [%1 + 5], %2
-- mov [%1 + 6], %2
-- mov [%1 + 7], %2
- mov [%1 + 8], %2
-- mov [%1 + 9], %2
-- mov [%1 + 10], %2
-- mov [%1 + 11], %2
- mov [%1 + 12], %2
-- mov [%1 + 13], %2
-- mov [%1 + 14], %2
-- mov [%1 + 15], %2
- %endmacro
-
- ALIGN 16
-@@ -93,15 +68,17 @@ image_brightness_sse2:
- push edi ; 8 bytes offset for push
- sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
-
-- movdqa xmm6, [xmm_0x80]
--
- ; Create a offset...offset vector
-- mov eax, [esp+8+32+20] ; brightness offset value
-- mov edx, esp ; edx will be esp aligned mod 16
-- add edx, 15 ; edx = esp + 15
-- and edx, ~15 ; edx = (esp + 15)&(~15)
-- CREATE_OFFSET_VECTOR edx, al
-- movdqa xmm7, [edx]
-+ movzx eax, byte [esp+8+32+20] ; brightness offset value
-+ mov ecx, esp ; ecx will be esp aligned mod 16
-+ mov edx, 0x01010101
-+ add ecx, 15 ; ecx = esp + 15
-+ mul edx
-+ and ecx, ~15 ; ecx = (esp + 15)&(~15)
-+ CREATE_OFFSET_VECTOR ecx, dword 0x80808080
-+ movdqa xmm6, [ecx]
-+ CREATE_OFFSET_VECTOR ecx, eax
-+ movdqa xmm7, [ecx]
-
- mov edx, [esp+8+32+4] ; Dst
- mov ecx, [esp+8+32+8] ; stride
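The rewritten offset-vector setup above also uses a compact byte splat: zero-extend the byte, then multiply by 0x01010101 to replicate it into all four bytes of eax (the product of a byte and 0x01010101 is at most 0xFFFFFFFF, so the mul conveniently leaves edx zero). In isolation, with an illustrative operand:

    movzx eax, byte [offset_val]    ; eax = 0x000000vv
    mov   edx, 0x01010101
    mul   edx                       ; edx:eax = eax * edx -> eax = 0xvvvvvvvv, edx = 0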
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/qpel_mmx.asm xvidcore-1.1.2/src/image/x86_asm/qpel_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/qpel_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/qpel_mmx.asm 2007-01-27 17:51:30.000000000 +0100
-@@ -201,6 +201,11 @@ FIR_C23: times 4 dw 23
-
- SECTION .text
-
-+extern _GLOBAL_OFFSET_TABLE_
-+get_pc.cx:
-+ mov ecx, [esp]
-+ retn
-+
- ;//////////////////////////////////////////////////////////////////////
- ;// Here we go with the Q-Pel mess.
- ;// For horizontal passes, we process 4 *output* pixel in parallel
-@@ -208,22 +213,25 @@ SECTION .text
- ;//////////////////////////////////////////////////////////////////////
-
- %macro PROLOG_NO_AVRG 0
-+ push ebx
- push esi
- push edi
- push ebp
-- mov edi, [esp+16 + 0*4] ; Dst
-- mov esi, [esp+16 + 1*4] ; Src
-- mov ecx, [esp+16 + 2*4] ; Size
-- mov ebp, [esp+16 + 3*4] ; BpS
-- mov eax, [esp+16 + 4*4] ; Rnd
-+ mov edi, [esp+20 + 0*4] ; Dst
-+ mov esi, [esp+20 + 1*4] ; Src
-+ mov ebp, [esp+20 + 3*4] ; BpS
-+ mov eax, [esp+20 + 4*4] ; Rnd
- and eax, 1
-- movq mm7, [Rounder_QP_MMX+eax*8] ; rounder
-+ call get_pc.cx
-+ add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+ movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder
- %endmacro
-
- %macro EPILOG_NO_AVRG 0
- pop ebp
- pop edi
- pop esi
-+ pop ebx
- ret
- %endmacro
-
-@@ -234,12 +242,13 @@ SECTION .text
- push ebp
- mov edi, [esp+20 + 0*4] ; Dst
- mov esi, [esp+20 + 1*4] ; Src
-- mov ecx, [esp+20 + 2*4] ; Size
- mov ebp, [esp+20 + 3*4] ; BpS
- mov eax, [esp+20 + 4*4] ; Rnd
- and eax, 1
-- movq mm7, [Rounder_QP_MMX+eax*8] ; rounder
-- lea ebx, [Rounder1_MMX+eax*8] ; *Rounder2
-+ call get_pc.cx
-+ add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
-+ movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder
-+ lea ebx, [ecx + Rounder1_MMX+eax*8 wrt ..gotoff] ; *Rounder2
- %endmacro
-
- %macro EPILOG_AVRG 0
-@@ -261,23 +270,23 @@ SECTION .text
- %macro TLOAD 2 ; %1,%2: src pixels
- movzx eax, byte [esi+%1]
- movzx edx, byte [esi+%2]
-- movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
-- movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
-+ movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
-+ movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
- paddw mm0, mm7
- paddw mm3, mm7
- %endmacro
-
- %macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
- movzx eax, byte [esi+%1]
-- paddw %4, [%2 + eax*8]
-- paddw %5, [%3 + eax*8]
-+ paddw %4, [eax*8 + %2]
-+ paddw %5, [eax*8 + %3]
- %endmacro
-
- %macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
- movzx eax, byte [esi+%1]
-- paddw %5, [%2 + eax*8]
-- paddw %6, [%3 + eax*8]
-- paddw %7, [%4 + eax*8]
-+ paddw %5, [eax*8 + %2]
-+ paddw %6, [eax*8 + %3]
-+ paddw %7, [eax*8 + %4]
- %endmacro
-
- ;//////////////////////////////////////////////////////////////////////
-@@ -287,32 +296,32 @@ SECTION .text
- %macro LOAD 2 ; %1,%2: src pixels
- movzx eax, byte [esi+%1]
- movzx edx, byte [esi+%2]
-- movq mm0, [xvid_Expand_mmx + eax*8]
-- movq mm3, [xvid_Expand_mmx + edx*8]
-- pmullw mm0, [FIR_R0 ]
-- pmullw mm3, [FIR_R16]
-+ movq mm0, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
-+ movq mm3, [ecx + xvid_Expand_mmx + edx*8 wrt ..gotoff]
-+ pmullw mm0, [ecx + FIR_R0 wrt ..gotoff]
-+ pmullw mm3, [ecx + FIR_R16 wrt ..gotoff]
- paddw mm0, mm7
- paddw mm3, mm7
- %endmacro
-
- %macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2
- movzx eax, byte [esi+%1]
-- movq mm4, [xvid_Expand_mmx + eax*8]
-+ movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
- movq mm5, mm4
- pmullw mm4, [%2]
-- pmullw mm5, [%2+8]
-+ pmullw mm5, [8+%2]
- paddw %3, mm4
- paddw %4, mm5
- %endmacro
-
- %macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3
- movzx eax, byte [esi+%1]
-- movq mm4, [xvid_Expand_mmx + eax*8]
-+ movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
- movq mm5, mm4
- movq mm6, mm5
-- pmullw mm4, [%2 ]
-- pmullw mm5, [%2+ 8]
-- pmullw mm6, [%2+16]
-+ pmullw mm4, [ %2]
-+ pmullw mm5, [ 8+%2]
-+ pmullw mm6, [16+%2]
- paddw %3, mm4
- paddw %4, mm5
- paddw %5, mm6
-@@ -359,23 +368,23 @@ SECTION .text
- movq mm1, mm7
- movq mm2, mm7
-
-- ACCUM2 1, FIR_R1, mm0, mm1
-- ACCUM2 2, FIR_R2, mm0, mm1
-- ACCUM2 3, FIR_R3, mm0, mm1
-- ACCUM2 4, FIR_R4, mm0, mm1
--
-- ACCUM3 5, FIR_R5, mm0, mm1, mm2
-- ACCUM3 6, FIR_R6, mm0, mm1, mm2
-- ACCUM3 7, FIR_R7, mm0, mm1, mm2
-- ACCUM2 8, FIR_R8, mm1, mm2
-- ACCUM3 9, FIR_R9, mm1, mm2, mm3
-- ACCUM3 10, FIR_R10,mm1, mm2, mm3
-- ACCUM3 11, FIR_R11,mm1, mm2, mm3
--
-- ACCUM2 12, FIR_R12, mm2, mm3
-- ACCUM2 13, FIR_R13, mm2, mm3
-- ACCUM2 14, FIR_R14, mm2, mm3
-- ACCUM2 15, FIR_R15, mm2, mm3
-+ ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm1
-+ ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm1
-+ ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm1
-+ ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm1
-+
-+ ACCUM3 5, ecx + FIR_R5 wrt ..gotoff, mm0, mm1, mm2
-+ ACCUM3 6, ecx + FIR_R6 wrt ..gotoff, mm0, mm1, mm2
-+ ACCUM3 7, ecx + FIR_R7 wrt ..gotoff, mm0, mm1, mm2
-+ ACCUM2 8, ecx + FIR_R8 wrt ..gotoff, mm1, mm2
-+ ACCUM3 9, ecx + FIR_R9 wrt ..gotoff, mm1, mm2, mm3
-+ ACCUM3 10, ecx + FIR_R10 wrt ..gotoff,mm1, mm2, mm3
-+ ACCUM3 11, ecx + FIR_R11 wrt ..gotoff,mm1, mm2, mm3
-+
-+ ACCUM2 12, ecx + FIR_R12 wrt ..gotoff, mm2, mm3
-+ ACCUM2 13, ecx + FIR_R13 wrt ..gotoff, mm2, mm3
-+ ACCUM2 14, ecx + FIR_R14 wrt ..gotoff, mm2, mm3
-+ ACCUM2 15, ecx + FIR_R15 wrt ..gotoff, mm2, mm3
-
- %else
-
-@@ -383,25 +392,25 @@ SECTION .text
- movq mm1, mm7
- movq mm2, mm7
-
-- TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
-- TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
-- TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
-- TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1
--
-- TACCUM3 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0 , mm0, mm1, mm2
-- TACCUM3 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1, mm2
-- TACCUM3 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1, mm2
--
-- TACCUM2 8, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm1, mm2
--
-- TACCUM3 9, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0, mm1, mm2, mm3
-- TACCUM3 10, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0, mm1, mm2, mm3
-- TACCUM3 11, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0, mm1, mm2, mm3
--
-- TACCUM2 12, xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
-- TACCUM2 13, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
-- TACCUM2 14, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
-- TACCUM2 15, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3
-+ TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1
-+ TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1
-+ TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1
-+ TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff, mm0, mm1
-+
-+ TACCUM3 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1, mm2
-+ TACCUM3 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1, mm2
-+ TACCUM3 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1, mm2
-+
-+ TACCUM2 8, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , xvid_FIR_20_6_3_1 wrt ..gotoff , mm1, mm2
-+
-+ TACCUM3 9, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff, mm1, mm2, mm3
-+ TACCUM3 10, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff, mm1, mm2, mm3
-+ TACCUM3 11, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff, mm1, mm2, mm3
-+
-+ TACCUM2 12, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff, ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm2, mm3
-+ TACCUM2 13, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm2, mm3
-+ TACCUM2 14, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm2, mm3
-+ TACCUM2 15, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm2, mm3
-
- %endif
-
-@@ -418,7 +427,7 @@ SECTION .text
- MIX mm0, esi+1, ebx
- %endif
- %if (%2==1)
-- MIX mm0, edi, Rounder1_MMX
-+ MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
-
- %if (%1==1)
-@@ -427,7 +436,7 @@ SECTION .text
- MIX mm2, esi+9, ebx
- %endif
- %if (%2==1)
-- MIX mm2, edi+8, Rounder1_MMX
-+ MIX mm2, edi+8, ecx + Rounder1_MMX wrt ..gotoff
- %endif
-
- lea esi, [esi+ebp]
-@@ -436,7 +445,7 @@ SECTION .text
- movq [edi+8], mm2
-
- add edi, ebp
-- dec ecx
-+ dec dword [esp+20 + 2*4]
- jg .Loop
-
- %if (%2==0) && (%1==0)
-@@ -464,64 +473,64 @@ SECTION .text
- %ifndef USE_TABLES
-
- LOAD 0, 8 ; special case for first/last pixel
-- ACCUM2 1, FIR_R1, mm0, mm3
-- ACCUM2 2, FIR_R2, mm0, mm3
-- ACCUM2 3, FIR_R3, mm0, mm3
-- ACCUM2 4, FIR_R4, mm0, mm3
--
-- ACCUM2 5, FIR_R13, mm0, mm3
-- ACCUM2 6, FIR_R14, mm0, mm3
-- ACCUM2 7, FIR_R15, mm0, mm3
-+ ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm3
-+ ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm3
-+ ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm3
-+ ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm3
-+
-+ ACCUM2 5, ecx + FIR_R13 wrt ..gotoff, mm0, mm3
-+ ACCUM2 6, ecx + FIR_R14 wrt ..gotoff, mm0, mm3
-+ ACCUM2 7, ecx + FIR_R15 wrt ..gotoff, mm0, mm3
-
- %else
-
- %if 0 ; test with no unrolling
-
- TLOAD 0, 8 ; special case for first/last pixel
-- TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm3
-- TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm3
-- TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm3
-- TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
-- TACCUM2 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm0, mm3
-- TACCUM2 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm0, mm3
-- TACCUM2 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm0, mm3
-+ TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm3
-+ TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm3
-+ TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm3
-+ TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm0, mm3
-+ TACCUM2 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm0, mm3
-+ TACCUM2 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm0, mm3
-+ TACCUM2 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm0, mm3
-
- %else ; test with unrolling (little faster, but not much)
-
- movzx eax, byte [esi]
- movzx edx, byte [esi+8]
-- movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
-+ movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
- movzx eax, byte [esi+1]
-- movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
-+ movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
- paddw mm0, mm7
- paddw mm3, mm7
-
- movzx edx, byte [esi+2]
-- paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
-- paddw mm3, [xvid_FIR_1_0_0_0 + eax*8]
-+ paddw mm0, [ecx + xvid_FIR_23_19_6_3 + eax*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_1_0_0_0 + eax*8 wrt ..gotoff]
-
- movzx eax, byte [esi+3]
-- paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
-- paddw mm3, [xvid_FIR_3_1_0_0 + edx*8]
-+ paddw mm0, [ecx + xvid_FIR_7_20_20_6 + edx*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_3_1_0_0 + edx*8 wrt ..gotoff]
-
- movzx edx, byte [esi+4]
-- paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
-- paddw mm3, [xvid_FIR_6_3_1_0 + eax*8]
-+ paddw mm0, [ecx + xvid_FIR_3_6_20_20 + eax*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_6_3_1_0 + eax*8 wrt ..gotoff]
-
- movzx eax, byte [esi+5]
-- paddw mm0, [xvid_FIR_1_3_6_20 + edx*8]
-- paddw mm3, [xvid_FIR_20_6_3_1 + edx*8]
-+ paddw mm0, [ecx + xvid_FIR_1_3_6_20 + edx*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_20_6_3_1 + edx*8 wrt ..gotoff]
-
- movzx edx, byte [esi+6]
-- paddw mm0, [xvid_FIR_0_1_3_6 + eax*8]
-- paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]
-+ paddw mm0, [ecx + xvid_FIR_0_1_3_6 + eax*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_20_20_6_3 + eax*8 wrt ..gotoff]
-
- movzx eax, byte [esi+7]
-- paddw mm0, [xvid_FIR_0_0_1_3 + edx*8]
-- paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]
-+ paddw mm0, [ecx + xvid_FIR_0_0_1_3 + edx*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_6_20_20_7 + edx*8 wrt ..gotoff]
-
-- paddw mm0, [xvid_FIR_0_0_0_1 + eax*8]
-- paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]
-+ paddw mm0, [ecx + xvid_FIR_0_0_0_1 + eax*8 wrt ..gotoff]
-+ paddw mm3, [ecx + xvid_FIR_3_6_19_23 + eax*8 wrt ..gotoff]
-
- %endif
-
-@@ -537,14 +546,14 @@ SECTION .text
- MIX mm0, esi+1, ebx
- %endif
- %if (%2==1)
-- MIX mm0, edi, Rounder1_MMX
-+ MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
-
- movq [edi], mm0
-
- add edi, ebp
- add esi, ebp
-- dec ecx
-+ dec dword [esp+20 + 2*4]
- jg .Loop
-
- %if (%2==0) && (%1==0)
-@@ -678,7 +687,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- V_MIX %3, esi, ebx
- %endif
- %if (%2==1)
-- V_MIX %3, edi, Rounder1_MMX
-+ V_MIX %3, edi, ecx + Rounder1_MMX wrt ..gotoff
- %endif
-
- movd eax, %3
-@@ -718,28 +727,28 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_STORE %1, %2, mm0, 0
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- V_STORE %1, %2, mm1, 0
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_STORE %1, %2, mm2, 0
-
- V_LOAD 1
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- V_STORE %1, %2, mm3, 0
-
- ; output rows [4..7], from input rows [1..11] (!!)
-@@ -756,38 +765,38 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC1 mm0, FIR_Cm1
-+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1
-+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
-- V_ACC1 mm2, FIR_Cm1
-+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_STORE %1, %2, mm0, 0
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- V_STORE %1, %2, mm1, 0
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_STORE %1, %2, mm2, 0
-
- V_LOAD 1
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- V_STORE %1, %2, mm3, 0
-
- ; output rows [8..11], from input rows [5..15]
-@@ -804,39 +813,39 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC1 mm0, FIR_Cm1
-+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1
-+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
-- V_ACC1 mm2, FIR_Cm1
-+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
-
- V_STORE %1, %2, mm0, 0
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
- V_STORE %1, %2, mm1, 0
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_STORE %1, %2, mm2, 0
-
- V_LOAD 1
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- V_STORE %1, %2, mm3, 0
-
-
-@@ -855,25 +864,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 1
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-
- V_STORE %1, %2, mm3, 0
- V_STORE %1, %2, mm2, 0
-@@ -886,7 +895,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- pop edi
- add esi, 4
- add edi, 4
-- sub ecx, 4
-+ sub dword [esp+20 + 2*4], 4
- jg .Loop
-
- %if (%2==0) && (%1==0)
-@@ -924,29 +933,29 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_STORE %1, %2, mm0, 0
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-
- V_STORE %1, %2, mm1, 0
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_STORE %1, %2, mm2, 0
-
- V_LOAD 1
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
- V_STORE %1, %2, mm3, 0
-
- ; output rows [4..7], from input rows [1..9]
-@@ -964,25 +973,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- movq mm3, mm7
-
- V_LOAD 0
-- V_ACC1 mm3, FIR_Cm1
-+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
-+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
-- V_ACC1 mm3, FIR_Cm6
-+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
-+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
-
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
- V_LOAD 0
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
- V_LOAD 1
-- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
-+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
-
- V_STORE %1, %2, mm3, 0
- V_STORE %1, %2, mm2, 0
-@@ -995,7 +1004,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
- pop edi
- add esi, 4
- add edi, 4
-- sub ecx, 4
-+ sub dword [esp+20 + 2*4], 4
- jg .Loop
-
- %if (%2==0) && (%1==0)
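[Nearly every hunk in the qpel_mmx.asm diff above is the same mechanical rewrite, so it is worth spelling the pattern out once. A call/mov thunk materializes the current instruction pointer, the standard NASM/ELF idiom turns that into the address of the GOT, and every table is then reached GOT-relative via `wrt ..gotoff`; the absolute addresses — and with them the R_386_32 text relocations — disappear. Distilled to a standalone routine, with illustrative label and table names rather than the xvid ones:]

    BITS 32

    extern  _GLOBAL_OFFSET_TABLE_

    SECTION .rodata
    ALIGN 8
    my_rounder: dw 1, 1, 1, 1           ; stand-in for a table like Rounder_QP_MMX

    SECTION .text

    get_pc.cx:                          ; returns its caller's address in ecx
            mov     ecx, [esp]
            retn

    global  pic_load_demo
    pic_load_demo:
            call    get_pc.cx           ; ecx = address of the following add
            add     ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc  ; ecx = GOT base
            movq    mm7, [ecx + my_rounder wrt ..gotoff] ; no absolute address emitted
            emms
            ret

[Dedicating ecx to the GOT base is also what forces the other changes visible above: the Size argument can no longer be cached in ecx, so the loops decrement their stack slot directly (`dec dword [esp+20 + 2*4]`), and PROLOG_NO_AVRG gains a `push ebx`, apparently so that both prologue variants leave the arguments at the same esp+20 offsets.]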
-diff -urp xvidcore-1.1.2-old/src/image/x86_asm/reduced_mmx.asm xvidcore-1.1.2/src/image/x86_asm/reduced_mmx.asm
---- xvidcore-1.1.2-old/src/image/x86_asm/reduced_mmx.asm 2007-01-27 19:43:48.000000000 +0100
-+++ xvidcore-1.1.2/src/image/x86_asm/reduced_mmx.asm 2007-01-27 13:33:30.000000000 +0100
-@@ -91,8 +91,8 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
- pmullw mm4, %4 ; [Up31]
- pmullw %2, %3 ; [Up13]
- pmullw mm5, %4 ; [Up31]
-- paddsw %1, [Cst2]
-- paddsw %2, [Cst2]
-+ paddsw %1, [ebp + Cst2 wrt ..gotoff]
-+ paddsw %2, [ebp + Cst2 wrt ..gotoff]
- paddsw %1, mm4
- paddsw %2, mm5
- %endmacro
-@@ -126,14 +126,14 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
-
- %macro MIX_ROWS 4 ; %1/%2:prev %3/4:cur (preserved) mm4/mm5: output
- ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
-- movq mm4, [Cst3]
-- movq mm5, [Cst3]
-+ movq mm4, [ebp + Cst3 wrt ..gotoff]
-+ movq mm5, [ebp + Cst3 wrt ..gotoff]
- pmullw mm4, %3
- pmullw mm5, %4
- paddsw mm4, %1
- paddsw mm5, %2
-- pmullw %1, [Cst3]
-- pmullw %2, [Cst3]
-+ pmullw %1, [ebp + Cst3 wrt ..gotoff]
-+ pmullw %2, [ebp + Cst3 wrt ..gotoff]
- paddsw %1, %3
- paddsw %2, %4
- %endmacro
<Skipped 1787 lines>
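[The skipped remainder is more of the same: the reduced_mmx.asm hunks just above keep the GOT base in ebp rather than ecx (`[ebp + Cst2 wrt ..gotoff]`), presumably because ecx is in use there, but the addressing pattern is identical. To confirm that a rebuilt library really is free of text relocations, a quick check — the soname shown is an assumption — is:]

    readelf -d libxvidcore.so.4 | grep -i textrel

[An empty result means no DT_TEXTREL entry, i.e. the shared object no longer needs writable text.]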
================================================================
---- gitweb:
http://git.pld-linux.org/gitweb.cgi/packages/xvid.git/commitdiff/f24cf361b3f55380b384d2dbce4e74092da8bfa0
More information about the pld-cvs-commit mailing list