Subject: SOURCES: xvid-1.1.2-textrel.patch (NEW) - TEXTREL fix from gentoo:...
From: radek <radek at pld-linux.org>
Date: Tue May 8 23:35:45 CEST 2007
Author: radek Date: Tue May 8 21:35:45 2007 GMT
Module: SOURCES Tag: HEAD
---- Log message:
- TEXTREL fix from gentoo: http://bugs.gentoo.org/show_bug.cgi?id=135326
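A TEXTREL is a relocation the dynamic linker has to apply inside the .text
segment; it forces code pages to be written at load time, defeats page sharing,
and can be refused outright on hardened (PaX-style) setups. The patch removes
the text relocations from xvidcore's hand-written x86 assembly in two ways,
both visible in the hunks below: small constants that were loaded from .rodata
through absolute addresses are rebuilt on the stack with push immediates, and
the larger lookup tables are addressed relative to the GOT, whose base is
loaded into ebx by a small get_pc thunk using NASM's wrt ..gotpc / wrt ..gotoff
operators. A minimal, self-contained sketch of that idiom follows (my_func and
my_table are illustrative names, not xvidcore symbols; the GOT sequence mirrors
the get_pc.bx thunk added below):

    BITS 32

    SECTION .rodata align=16
    my_table:                       ; illustrative constant table
            dw 0, -1, -1, -1

    SECTION .text

    extern _GLOBAL_OFFSET_TABLE_

    get_pc.bx:                      ; return the caller's return address in ebx
            mov ebx, [esp]
            retn

    global my_func
    my_func:
            push ebx                ; ebx is callee-saved
            call get_pc.bx          ; ebx <- address of the add below
            add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebx <- GOT base

            ; GOT-relative load: no absolute data address embedded in .text
            movq mm7, [ebx + my_table wrt ..gotoff]

            ; stack-built constant, as done in cbp_mmx.asm / cbp_sse2.asm
            ; (the real patch also aligns esp before the pushes)
            push byte -1            ; high dword 0xFFFFFFFF
            push dword 0xFFFF0000   ; low dword
            movq mm6, [esp]         ; mm6 = 0xFFFFFFFFFFFF0000, the ignore-dc mask
            add esp, byte 8

            pop ebx
            ret

Since the offset of a local symbol from the GOT base is a link-time constant,
the assembler emits GOTOFF relocations that are fully resolved when the shared
object is linked, so libxvidcore loads with its code pages left read-only.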
---- Files affected:
SOURCES:
xvid-1.1.2-textrel.patch (NONE -> 1.1) (NEW)
---- Diffs:
================================================================
Index: SOURCES/xvid-1.1.2-textrel.patch
diff -u /dev/null SOURCES/xvid-1.1.2-textrel.patch:1.1
--- /dev/null Tue May 8 23:35:45 2007
+++ SOURCES/xvid-1.1.2-textrel.patch Tue May 8 23:35:40 2007
@@ -0,0 +1,5757 @@
+diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm
+--- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -50,23 +50,6 @@ BITS 32
+ %endmacro
+
+ ;=============================================================================
+-; Local data
+-;=============================================================================
+-
+-%ifdef FORMAT_COFF
+-SECTION .rodata
+-%else
+-SECTION .rodata align=16
+-%endif
+-
+-ALIGN 16
+-
+-mult_mask:
+- db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00
+-ignore_dc:
+- dw 0, -1, -1, -1
+-
+-;=============================================================================
+ ; Code
+ ;=============================================================================
+
+@@ -91,7 +74,12 @@ ALIGN 16
+ calc_cbp_mmx:
+ mov eax, [esp + 4] ; coeff
+
+- movq mm7, [ignore_dc]
++ push byte 0 ; align esp to 8 bytes
++ push byte -1
++ push dword 0xFFFF0000
++ movq mm7, [esp]
++ add esp, byte 8
++
+ pxor mm6, mm6 ; used only for comparing
+ movq mm0, [eax+128*0]
+ movq mm1, [eax+128*1]
+@@ -123,7 +111,11 @@ calc_cbp_mmx:
+ MAKE_LOAD 13
+ MAKE_LOAD 14
+
+- movq mm7, [mult_mask]
++ push dword 0x00000201
++ push dword 0x08042010
++ movq mm7, [esp]
++ add esp, byte 12
++
+ packssdw mm0, mm1
+ packssdw mm2, mm3
+ packssdw mm4, mm5
+diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm
+--- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -69,20 +69,6 @@ BITS 32
+ %endmacro
+
+ ;=============================================================================
+-; Data (Read Only)
+-;=============================================================================
+-
+-%ifdef FORMAT_COFF
+-SECTION .rodata
+-%else
+-SECTION .rodata align=16
+-%endif
+-
+-ALIGN 16
+-ignore_dc:
+- dw 0, -1, -1, -1, -1, -1, -1, -1
+-
+-;=============================================================================
+ ; Code
+ ;=============================================================================
+
+@@ -98,7 +84,13 @@ calc_cbp_sse2:
+ mov edx, [esp+4] ; coeff[]
+ xor eax, eax ; cbp = 0
+
+- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
++ sub esp,byte 12 ; align esp to 16 bytes
++ push byte -1
++ push byte -1
++ push byte -1
++ push dword 0xFFFF0000
++ movdqu xmm7, [esp] ; mask to ignore dc value
++ add esp, byte 28
+ pxor xmm6, xmm6 ; zero
+
+ LOOP_SSE2 0
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -204,7 +204,7 @@ fdct_r_row:
+ psllw mm4, SHIFT_FRW_COL
+ movq mm6, mm0
+ psubsw mm2, mm1
+- movq mm1, [fdct_tg_all_16 + 4*2]
++ movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
+ psubsw mm0, mm4
+ movq mm7, [%2 + %3*2 + 3*16]
+ pmulhw mm1, mm0
+@@ -216,9 +216,9 @@ fdct_r_row:
+ psubsw mm5, mm7
+ paddsw mm1, mm5
+ paddsw mm4, mm7
+- por mm1, [fdct_one_corr]
++ por mm1, [ebx + fdct_one_corr wrt ..gotoff]
+ psllw mm2, SHIFT_FRW_COL + 1
+- pmulhw mm5, [fdct_tg_all_16 + 4*2]
++ pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
+ movq mm7, mm4
+ psubsw mm3, [%2 + %3*2 + 5*16]
+ psubsw mm4, mm6
+@@ -230,34 +230,34 @@ fdct_r_row:
+ movq mm6, mm2
+ movq [%1 + %3*2 + 4*16], mm4
+ paddsw mm2, mm3
+- pmulhw mm2, [ocos_4_16]
++ pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff]
+ psubsw mm6, mm3
+- pmulhw mm6, [ocos_4_16]
++ pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff]
+ psubsw mm5, mm0
+- por mm5, [fdct_one_corr]
++ por mm5, [ebx + fdct_one_corr wrt ..gotoff]
+ psllw mm1, SHIFT_FRW_COL
+- por mm2, [fdct_one_corr]
++ por mm2, [ebx + fdct_one_corr wrt ..gotoff]
+ movq mm4, mm1
+ movq mm3, [%2 + %3*2 + 0*16]
+ paddsw mm1, mm6
+ psubsw mm3, [%2 + %3*2 + 7*16]
+ psubsw mm4, mm6
+- movq mm0, [fdct_tg_all_16 + 0*2]
++ movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
+ psllw mm3, SHIFT_FRW_COL
+- movq mm6, [fdct_tg_all_16 + 8*2]
++ movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
+ pmulhw mm0, mm1
+ movq [%1 + %3*2 + 0*16], mm7
+ pmulhw mm6, mm4
+ movq [%1 + %3*2 + 6*16], mm5
+ movq mm7, mm3
+- movq mm5, [fdct_tg_all_16 + 8*2]
++ movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
+ psubsw mm7, mm2
+ paddsw mm3, mm2
+ pmulhw mm5, mm7
+ paddsw mm0, mm3
+ paddsw mm6, mm4
+- pmulhw mm3, [fdct_tg_all_16 + 0*2]
+- por mm0, [fdct_one_corr]
++ pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
++ por mm0, [ebx + fdct_one_corr wrt ..gotoff]
+ paddsw mm5, mm7
+ psubsw mm7, mm6
+ movq [%1 + %3*2 + 1*16], mm0
+@@ -287,28 +287,28 @@ fdct_r_row:
+ movq mm6, mm5
+ punpckldq mm3, mm5
+ punpckhdq mm6, mm3
+- movq mm3, [%3 + 0*2]
+- movq mm4, [%3 + 4*2]
++ movq mm3, [0*2 + %3]
++ movq mm4, [4*2 + %3]
+ punpckldq mm2, mm0
+ pmaddwd mm3, mm0
+ punpckhdq mm1, mm2
+- movq mm2, [%3 + 16*2]
++ movq mm2, [16*2 + %3]
+ pmaddwd mm4, mm1
+- pmaddwd mm0, [%3 + 8*2]
+- movq mm7, [%3 + 20*2]
++ pmaddwd mm0, [8*2 + %3]
++ movq mm7, [20*2 + %3]
+ pmaddwd mm2, mm5
+- paddd mm3, [fdct_r_row]
++ paddd mm3, [ebx + fdct_r_row wrt ..gotoff]
+ pmaddwd mm7, mm6
+- pmaddwd mm1, [%3 + 12*2]
++ pmaddwd mm1, [12*2 + %3]
+ paddd mm3, mm4
+- pmaddwd mm5, [%3 + 24*2]
+- pmaddwd mm6, [%3 + 28*2]
++ pmaddwd mm5, [24*2 + %3]
++ pmaddwd mm6, [28*2 + %3]
+ paddd mm2, mm7
+- paddd mm0, [fdct_r_row]
++ paddd mm0, [ebx + fdct_r_row wrt ..gotoff]
+ psrad mm3, SHIFT_FRW_ROW
+- paddd mm2, [fdct_r_row]
++ paddd mm2, [ebx + fdct_r_row wrt ..gotoff]
+ paddd mm0, mm1
+- paddd mm5, [fdct_r_row]
++ paddd mm5, [ebx + fdct_r_row wrt ..gotoff]
+ psrad mm2, SHIFT_FRW_ROW
+ paddd mm5, mm6
+ psrad mm0, SHIFT_FRW_ROW
+@@ -336,23 +336,23 @@ fdct_r_row:
+ psubsw mm1, mm5
+ pshufw mm2, mm0, 0x4E
+ pshufw mm3, mm1, 0x4E
+- movq mm4, [%3 + 0*2]
+- movq mm6, [%3 + 4*2]
+- movq mm5, [%3 + 16*2]
+- movq mm7, [%3 + 20*2]
++ movq mm4, [ 0*2 + %3]
++ movq mm6, [ 4*2 + %3]
++ movq mm5, [16*2 + %3]
++ movq mm7, [20*2 + %3]
+ pmaddwd mm4, mm0
+ pmaddwd mm5, mm1
+ pmaddwd mm6, mm2
+ pmaddwd mm7, mm3
+- pmaddwd mm0, [%3 + 8*2]
+- pmaddwd mm2, [%3 + 12*2]
+- pmaddwd mm1, [%3 + 24*2]
+- pmaddwd mm3, [%3 + 28*2]
++ pmaddwd mm0, [ 8*2 + %3]
++ pmaddwd mm2, [12*2 + %3]
++ pmaddwd mm1, [24*2 + %3]
++ pmaddwd mm3, [28*2 + %3]
+ paddd mm4, mm6
+ paddd mm5, mm7
+ paddd mm0, mm2
+ paddd mm1, mm3
+- movq mm7, [fdct_r_row]
++ movq mm7, [ebx + fdct_r_row wrt ..gotoff]
+ paddd mm4, mm7
+ paddd mm5, mm7
+ paddd mm0, mm7
+@@ -377,6 +377,10 @@ cglobal %1
+ ;; Move the destination/source address to the eax register
+ mov eax, [esp + 4]
+
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
++
+ ;; Process the columns (4 at a time)
+ FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
+ FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
+@@ -386,12 +390,12 @@ cglobal %1
+ %assign i 0
+ %rep 8
+ ;; Process the 'i'th row
+- %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
++ %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff
+ %assign i i+1
+ %endrep
+ %else
+ mov ecx, 8
+- mov edx, tab_frw_01234567
++ mov edx, [ebx + tab_frw_01234567 wrt ..gotoff]
+ ALIGN 8
+ .loop
+ %2 eax, eax, edx
+@@ -401,6 +405,7 @@ ALIGN 8
+ jne .loop
+ %endif
+
++ pop ebx
+ ret
+ .endfunc
+ %endmacro
+@@ -411,6 +416,11 @@ ALIGN 8
+
+ SECTION .text
+
++extern _GLOBAL_OFFSET_TABLE_
++get_pc.bx:
++ mov ebx, [esp]
++ retn
++
+ ;-----------------------------------------------------------------------------
+ ; void fdct_mmx_ffmpeg(int16_t block[64]);
+ ;-----------------------------------------------------------------------------
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -294,15 +294,15 @@ MMX_One:
+ paddsw mm2, mm1 ; mm2: t6+t5
+ movq [%1+0*16], mm5 ; => out0
+
+- movq mm4, [tan2] ; mm4 <= tan2
++ movq mm4, [ebx + tan2 wrt ..gotoff] ; mm4 <= tan2
+ pmulhw mm4, mm7 ; tm03*tan2
+- movq mm5, [tan2] ; mm5 <= tan2
++ movq mm5, [ebx + tan2 wrt ..gotoff] ; mm5 <= tan2
+ psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12
+ pmulhw mm5, mm6 ; tm12*tan2
+ paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03
+
+- movq mm6, [sqrt2]
+- movq mm7, [MMX_One]
++ movq mm6, [ebx + sqrt2 wrt ..gotoff]
++ movq mm7, [ebx + MMX_One wrt ..gotoff]
+
+ pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4
+ por mm5, mm7 ; correct out2
+@@ -320,8 +320,8 @@ MMX_One:
+ paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65
+ paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65
+
+- movq mm4, [tan3] ; tan3 - 1
+- movq mm5, [tan1] ; tan1
++ movq mm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1
++ movq mm5, [ebx + tan1 wrt ..gotoff] ; tan1
+
+ movq mm7, mm3 ; save tm465
+ pmulhw mm3, mm4 ; tm465*(tan3-1)
+@@ -364,23 +364,23 @@ MMX_One:
+ punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
+ punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
+
+- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
+- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
++ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
++ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
+ pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
+- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
++ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
+ pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
+- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
++ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
+ pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
+- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
++ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
+ pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
+- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
++ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
+ pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
+ paddd mm2, mm3 ; [ out0 | out1 ]
+ pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
+ psrad mm2, 16
+- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
++ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
+ paddd mm4, mm5 ; [ out2 | out3 ]
+- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
++ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
+ psrad mm4, 16
+
+ paddd mm6, mm7 ; [ out4 | out5 ]
+@@ -422,23 +422,23 @@ MMX_One:
+ punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
+ punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
+
+- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
+- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
++ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
++ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
+ pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
+- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
++ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
+ pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
+- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
++ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
+ pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
+- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
++ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
+ pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
+- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
++ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
+ pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
+ paddd mm2, mm3 ; [ out0 | out1 ]
+ pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
+ psrad mm2, 16
+- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
++ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
+ paddd mm4, mm5 ; [ out2 | out3 ]
+- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
++ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
+ psrad mm4, 16
+
+ paddd mm6, mm7 ; [ out4 | out5 ]
+@@ -467,12 +467,16 @@ MMX_One:
+ ALIGN 16
+ cglobal %1
+ %1:
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
++
+ %ifdef UNROLLED_LOOP
+- mov ecx, [esp + 4]
++ mov ecx, [esp + 4 + 4]
+ %else
+- push ebx
++ push esi
+ push edi
+- mov ecx, [esp + 8 + 4]
++ mov ecx, [esp + 12 + 4]
+ %endif
+
+ fLLM_PASS ecx+0, ecx+0, 3
+@@ -481,27 +485,28 @@ cglobal %1
+ %ifdef UNROLLED_LOOP
+ %assign i 0
+ %rep 8
+- %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8
++ %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff
+ %assign i i+1
+ %endrep
+ %else
+ mov eax, 8
+- mov edx, fdct_table
+- mov ebx, fdct_rounding_1
+- mov edi, fdct_rounding_2
++ lea edx, [ebx + fdct_table wrt ..gotoff]
++ lea esi, [ebx + fdct_rounding_1 wrt ..gotoff]
++ lea edi, [ebx + fdct_rounding_2 wrt ..gotoff]
+ .loop
+- %2 ecx, ecx, edx, ebx, edi
++ %2 ecx, ecx, edx, esi, edi
+ add ecx, 2*8
+ add edx, 2*32
+- add ebx, 2*4
++ add esi, 2*4
+ add edi, 2*4
+ dec eax
+ jne .loop
+
+ pop edi
+- pop ebx
++ pop esi
+ %endif
+
++ pop ebx
+ ret
+ .endfunc
+ %endmacro
+@@ -512,6 +517,11 @@ cglobal %1
+
+ SECTION .text
+
++extern _GLOBAL_OFFSET_TABLE_
++get_pc.bx:
++ mov ebx, [esp]
++ retn
++
+ ;-----------------------------------------------------------------------------
+ ; void fdct_mmx_skal(int16_t block[64]];
+ ;-----------------------------------------------------------------------------
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -238,10 +238,10 @@ cglobal fdct_sse2_skal
+ pshufd xmm6, xmm0, 01010101b ; [13131313]
+ pshufd xmm7, xmm0, 11111111b ; [57575757]
+
+- pmaddwd xmm4, [%2+ 0] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
+- pmaddwd xmm5, [%2+16] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
+- pmaddwd xmm6, [%2+32] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
+- pmaddwd xmm7, [%2+48] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
++ pmaddwd xmm4, [ 0 + %2] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
++ pmaddwd xmm5, [16 + %2] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
++ pmaddwd xmm6, [32 + %2] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
++ pmaddwd xmm7, [48 + %2] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
+ paddd xmm4, [%3] ; Round
+
+ paddd xmm6, xmm7 ; [b0|b1|b2|b3]
+@@ -267,12 +267,12 @@ cglobal fdct_sse2_skal
+
+ %macro iLLM_PASS 1 ; %1: src/dst
+
+- movdqa xmm0, [tan3] ; t3-1
++ movdqa xmm0, [ebx + tan3 wrt ..gotoff] ; t3-1
+ movdqa xmm3, [%1+16*3] ; x3
+ movdqa xmm1, xmm0 ; t3-1
+ movdqa xmm5, [%1+16*5] ; x5
+
+- movdqa xmm4, [tan1] ; t1
++ movdqa xmm4, [ebx + tan1 wrt ..gotoff] ; t1
+ movdqa xmm6, [%1+16*1] ; x1
+ movdqa xmm7, [%1+16*7] ; x7
+ movdqa xmm2, xmm4 ; t1
+@@ -290,7 +290,7 @@ cglobal fdct_sse2_skal
+ psubsw xmm2, xmm7 ; x1*t1-x7 = tm17
+
+
+- movdqa xmm3, [sqrt2]
++ movdqa xmm3, [ebx + sqrt2 wrt ..gotoff]
+ movdqa xmm7, xmm4
+ movdqa xmm6, xmm2
+ psubsw xmm4, xmm1 ; tp17-tp35 = t1
+@@ -310,7 +310,7 @@ cglobal fdct_sse2_skal
+ paddsw xmm0, xmm0 ; 2.(t1+t2) = b1
+ paddsw xmm4, xmm4 ; 2.(t1-t2) = b2
+
+- movdqa xmm7, [tan2] ; t2
++ movdqa xmm7, [ebx + tan2 wrt ..gotoff] ; t2
+ movdqa xmm3, [%1+2*16] ; x2
+ movdqa xmm6, [%1+6*16] ; x6
+ movdqa xmm5, xmm7 ; t2
+@@ -402,55 +402,58 @@ cglobal fdct_sse2_skal
+
+ ALIGN 16
+ idct_sse2_skal:
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+- mov ecx, [esp+ 4] ; Src
++ mov ecx, [esp+ 4 +4] ; Src
+
+ TEST_ROW ecx, .Row0_Round
+- iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11
++ iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row1
+ .Row0_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
++ movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx ], xmm0
+
+ .Row1
+ TEST_ROW ecx+16, .Row1_Round
+- iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11
++ iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row2
+ .Row1_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
++ movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx+16 ], xmm0
+
+ .Row2
+ TEST_ROW ecx+32, .Row2_Round
+- iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11
++ iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row3
+ .Row2_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
++ movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx+32 ], xmm0
+
+ .Row3
+ TEST_ROW ecx+48, .Row4
+- iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11
++ iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row4
+ TEST_ROW ecx+64, .Row5
+- iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11
++ iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row5
+ TEST_ROW ecx+80, .Row6
+- iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11
++ iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row6
+ TEST_ROW ecx+96, .Row7
+- iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11
++ iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row7
+ TEST_ROW ecx+112, .End
+- iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11
++ iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt ..gotoff, 11
+ .End
+
+ iLLM_PASS ecx
+-
++ pop ebx
+ ret
+ .endfunc
+
+@@ -507,15 +510,15 @@ idct_sse2_skal:
+ paddsw xmm2, xmm1 ; xmm2: t6+t5
+ movdqa [%1+0*16], xmm5 ; => out0
+
+- movdqa xmm4, [tan2] ; xmm4 <= tan2
++ movdqa xmm4, [ebx + tan2 wrt ..gotoff] ; xmm4 <= tan2
+ pmulhw xmm4, xmm7 ; tm03*tan2
+- movdqa xmm5, [tan2] ; xmm5 <= tan2
++ movdqa xmm5, [ebx + tan2 wrt ..gotoff] ; xmm5 <= tan2
+ psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12
+ pmulhw xmm5, xmm6 ; tm12*tan2
+ paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03
+
+- movdqa xmm6, [sqrt2]
+- movdqa xmm7, [Rounder1]
++ movdqa xmm6, [ebx + sqrt2 wrt ..gotoff]
++ movdqa xmm7, [ebx + Rounder1 wrt ..gotoff]
+
+ pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4
+ por xmm5, xmm7 ; correct out2
+@@ -533,8 +536,8 @@ idct_sse2_skal:
<<Diff was trimmed, longer than 597 lines>>