Subject: SOURCES: xvid-1.1.2-textrel.patch (NEW) - TEXTREL fix from gentoo:...
From: radek <radek at pld-linux.org>
Date: Tue May 8 23:35:45 CEST 2007
Author: radek Date: Tue May 8 21:35:45 2007 GMT
Module: SOURCES Tag: HEAD
---- Log message:
- TEXTREL fix from gentoo: http://bugs.gentoo.org/show_bug.cgi?id=135326
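A TEXTREL is a relocation the dynamic linker has to apply inside the .text
segment; it forces code pages to be written at load time, defeats page sharing,
and can be refused outright on hardened (PaX-style) setups. The patch removes
the text relocations from xvidcore's hand-written x86 assembly in two ways,
both visible in the hunks below: small constants that were loaded from .rodata
through absolute addresses are rebuilt on the stack with push immediates, and
the larger lookup tables are addressed relative to the GOT, whose base is
loaded into ebx by a small get_pc thunk using NASM's wrt ..gotpc / wrt ..gotoff
operators. A minimal, self-contained sketch of that idiom follows (my_func and
my_table are illustrative names, not xvidcore symbols; the GOT sequence mirrors
the get_pc.bx thunk added below):

    BITS 32

    SECTION .rodata align=16
    my_table:                       ; illustrative constant table
            dw 0, -1, -1, -1

    SECTION .text

    extern _GLOBAL_OFFSET_TABLE_

    get_pc.bx:                      ; return the caller's return address in ebx
            mov ebx, [esp]
            retn

    global my_func
    my_func:
            push ebx                ; ebx is callee-saved
            call get_pc.bx          ; ebx <- address of the add below
            add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebx <- GOT base

            ; GOT-relative load: no absolute data address embedded in .text
            movq mm7, [ebx + my_table wrt ..gotoff]

            ; stack-built constant, as done in cbp_mmx.asm / cbp_sse2.asm
            ; (the real patch also aligns esp before the pushes)
            push byte -1            ; high dword 0xFFFFFFFF
            push dword 0xFFFF0000   ; low dword
            movq mm6, [esp]         ; mm6 = 0xFFFFFFFFFFFF0000, the ignore-dc mask
            add esp, byte 8

            pop ebx
            ret

Since the offset of a local symbol from the GOT base is a link-time constant,
the assembler emits GOTOFF relocations that are fully resolved when the shared
object is linked, so libxvidcore loads with its code pages left read-only.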
---- Files affected:
SOURCES:
xvid-1.1.2-textrel.patch (NONE -> 1.1) (NEW)
---- Diffs:
================================================================
Index: SOURCES/xvid-1.1.2-textrel.patch
diff -u /dev/null SOURCES/xvid-1.1.2-textrel.patch:1.1
--- /dev/null Tue May 8 23:35:45 2007
+++ SOURCES/xvid-1.1.2-textrel.patch Tue May 8 23:35:40 2007
@@ -0,0 +1,5757 @@
+diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm
+--- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_mmx.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -50,23 +50,6 @@ BITS 32
+ %endmacro
+
+ ;=============================================================================
+-; Local data
+-;=============================================================================
+-
+-%ifdef FORMAT_COFF
+-SECTION .rodata
+-%else
+-SECTION .rodata align=16
+-%endif
+-
+-ALIGN 16
+-
+-mult_mask:
+- db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00
+-ignore_dc:
+- dw 0, -1, -1, -1
+-
+-;=============================================================================
+ ; Code
+ ;=============================================================================
+
+@@ -91,7 +74,12 @@ ALIGN 16
+ calc_cbp_mmx:
+ mov eax, [esp + 4] ; coeff
+
+- movq mm7, [ignore_dc]
++ push byte 0 ; align esp to 8 bytes
++ push byte -1
++ push dword 0xFFFF0000
++ movq mm7, [esp]
++ add esp, byte 8
++
+ pxor mm6, mm6 ; used only for comparing
+ movq mm0, [eax+128*0]
+ movq mm1, [eax+128*1]
+@@ -123,7 +111,11 @@ calc_cbp_mmx:
+ MAKE_LOAD 13
+ MAKE_LOAD 14
+
+- movq mm7, [mult_mask]
++ push dword 0x00000201
++ push dword 0x08042010
++ movq mm7, [esp]
++ add esp, byte 12
++
+ packssdw mm0, mm1
+ packssdw mm2, mm3
+ packssdw mm4, mm5
+diff -urp xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm
+--- xvidcore-1.1.2-old/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/bitstream/x86_asm/cbp_sse2.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -69,20 +69,6 @@ BITS 32
+ %endmacro
+
+ ;=============================================================================
+-; Data (Read Only)
+-;=============================================================================
+-
+-%ifdef FORMAT_COFF
+-SECTION .rodata
+-%else
+-SECTION .rodata align=16
+-%endif
+-
+-ALIGN 16
+-ignore_dc:
+- dw 0, -1, -1, -1, -1, -1, -1, -1
+-
+-;=============================================================================
+ ; Code
+ ;=============================================================================
+
+@@ -98,7 +84,13 @@ calc_cbp_sse2:
+ mov edx, [esp+4] ; coeff[]
+ xor eax, eax ; cbp = 0
+
+- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
++ sub esp,byte 12 ; align esp to 16 bytes
++ push byte -1
++ push byte -1
++ push byte -1
++ push dword 0xFFFF0000
++ movdqu xmm7, [esp] ; mask to ignore dc value
++ add esp, byte 28
+ pxor xmm6, xmm6 ; zero
+
+ LOOP_SSE2 0
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -204,7 +204,7 @@ fdct_r_row:
+ psllw mm4, SHIFT_FRW_COL
+ movq mm6, mm0
+ psubsw mm2, mm1
+- movq mm1, [fdct_tg_all_16 + 4*2]
++ movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
+ psubsw mm0, mm4
+ movq mm7, [%2 + %3*2 + 3*16]
+ pmulhw mm1, mm0
+@@ -216,9 +216,9 @@ fdct_r_row:
+ psubsw mm5, mm7
+ paddsw mm1, mm5
+ paddsw mm4, mm7
+- por mm1, [fdct_one_corr]
++ por mm1, [ebx + fdct_one_corr wrt ..gotoff]
+ psllw mm2, SHIFT_FRW_COL + 1
+- pmulhw mm5, [fdct_tg_all_16 + 4*2]
++ pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
+ movq mm7, mm4
+ psubsw mm3, [%2 + %3*2 + 5*16]
+ psubsw mm4, mm6
+@@ -230,34 +230,34 @@ fdct_r_row:
+ movq mm6, mm2
+ movq [%1 + %3*2 + 4*16], mm4
+ paddsw mm2, mm3
+- pmulhw mm2, [ocos_4_16]
++ pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff]
+ psubsw mm6, mm3
+- pmulhw mm6, [ocos_4_16]
++ pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff]
+ psubsw mm5, mm0
+- por mm5, [fdct_one_corr]
++ por mm5, [ebx + fdct_one_corr wrt ..gotoff]
+ psllw mm1, SHIFT_FRW_COL
+- por mm2, [fdct_one_corr]
++ por mm2, [ebx + fdct_one_corr wrt ..gotoff]
+ movq mm4, mm1
+ movq mm3, [%2 + %3*2 + 0*16]
+ paddsw mm1, mm6
+ psubsw mm3, [%2 + %3*2 + 7*16]
+ psubsw mm4, mm6
+- movq mm0, [fdct_tg_all_16 + 0*2]
++ movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
+ psllw mm3, SHIFT_FRW_COL
+- movq mm6, [fdct_tg_all_16 + 8*2]
++ movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
+ pmulhw mm0, mm1
+ movq [%1 + %3*2 + 0*16], mm7
+ pmulhw mm6, mm4
+ movq [%1 + %3*2 + 6*16], mm5
+ movq mm7, mm3
+- movq mm5, [fdct_tg_all_16 + 8*2]
++ movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
+ psubsw mm7, mm2
+ paddsw mm3, mm2
+ pmulhw mm5, mm7
+ paddsw mm0, mm3
+ paddsw mm6, mm4
+- pmulhw mm3, [fdct_tg_all_16 + 0*2]
+- por mm0, [fdct_one_corr]
++ pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
++ por mm0, [ebx + fdct_one_corr wrt ..gotoff]
+ paddsw mm5, mm7
+ psubsw mm7, mm6
+ movq [%1 + %3*2 + 1*16], mm0
+@@ -287,28 +287,28 @@ fdct_r_row:
+ movq mm6, mm5
+ punpckldq mm3, mm5
+ punpckhdq mm6, mm3
+- movq mm3, [%3 + 0*2]
+- movq mm4, [%3 + 4*2]
++ movq mm3, [0*2 + %3]
++ movq mm4, [4*2 + %3]
+ punpckldq mm2, mm0
+ pmaddwd mm3, mm0
+ punpckhdq mm1, mm2
+- movq mm2, [%3 + 16*2]
++ movq mm2, [16*2 + %3]
+ pmaddwd mm4, mm1
+- pmaddwd mm0, [%3 + 8*2]
+- movq mm7, [%3 + 20*2]
++ pmaddwd mm0, [8*2 + %3]
++ movq mm7, [20*2 + %3]
+ pmaddwd mm2, mm5
+- paddd mm3, [fdct_r_row]
++ paddd mm3, [ebx + fdct_r_row wrt ..gotoff]
+ pmaddwd mm7, mm6
+- pmaddwd mm1, [%3 + 12*2]
++ pmaddwd mm1, [12*2 + %3]
+ paddd mm3, mm4
+- pmaddwd mm5, [%3 + 24*2]
+- pmaddwd mm6, [%3 + 28*2]
++ pmaddwd mm5, [24*2 + %3]
++ pmaddwd mm6, [28*2 + %3]
+ paddd mm2, mm7
+- paddd mm0, [fdct_r_row]
++ paddd mm0, [ebx + fdct_r_row wrt ..gotoff]
+ psrad mm3, SHIFT_FRW_ROW
+- paddd mm2, [fdct_r_row]
++ paddd mm2, [ebx + fdct_r_row wrt ..gotoff]
+ paddd mm0, mm1
+- paddd mm5, [fdct_r_row]
++ paddd mm5, [ebx + fdct_r_row wrt ..gotoff]
+ psrad mm2, SHIFT_FRW_ROW
+ paddd mm5, mm6
+ psrad mm0, SHIFT_FRW_ROW
+@@ -336,23 +336,23 @@ fdct_r_row:
+ psubsw mm1, mm5
+ pshufw mm2, mm0, 0x4E
+ pshufw mm3, mm1, 0x4E
+- movq mm4, [%3 + 0*2]
+- movq mm6, [%3 + 4*2]
+- movq mm5, [%3 + 16*2]
+- movq mm7, [%3 + 20*2]
++ movq mm4, [ 0*2 + %3]
++ movq mm6, [ 4*2 + %3]
++ movq mm5, [16*2 + %3]
++ movq mm7, [20*2 + %3]
+ pmaddwd mm4, mm0
+ pmaddwd mm5, mm1
+ pmaddwd mm6, mm2
+ pmaddwd mm7, mm3
+- pmaddwd mm0, [%3 + 8*2]
+- pmaddwd mm2, [%3 + 12*2]
+- pmaddwd mm1, [%3 + 24*2]
+- pmaddwd mm3, [%3 + 28*2]
++ pmaddwd mm0, [ 8*2 + %3]
++ pmaddwd mm2, [12*2 + %3]
++ pmaddwd mm1, [24*2 + %3]
++ pmaddwd mm3, [28*2 + %3]
+ paddd mm4, mm6
+ paddd mm5, mm7
+ paddd mm0, mm2
+ paddd mm1, mm3
+- movq mm7, [fdct_r_row]
++ movq mm7, [ebx + fdct_r_row wrt ..gotoff]
+ paddd mm4, mm7
+ paddd mm5, mm7
+ paddd mm0, mm7
+@@ -377,6 +377,10 @@ cglobal %1
+ ;; Move the destination/source address to the eax register
+ mov eax, [esp + 4]
+
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
++
+ ;; Process the columns (4 at a time)
+ FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
+ FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
+@@ -386,12 +390,12 @@ cglobal %1
+ %assign i 0
+ %rep 8
+ ;; Process the 'i'th row
+- %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
++ %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff
+ %assign i i+1
+ %endrep
+ %else
+ mov ecx, 8
+- mov edx, tab_frw_01234567
++ mov edx, [ebx + tab_frw_01234567 wrt ..gotoff]
+ ALIGN 8
+ .loop
+ %2 eax, eax, edx
+@@ -401,6 +405,7 @@ ALIGN 8
+ jne .loop
+ %endif
+
++ pop ebx
+ ret
+ .endfunc
+ %endmacro
+@@ -411,6 +416,11 @@ ALIGN 8
+
+ SECTION .text
+
++extern _GLOBAL_OFFSET_TABLE_
++get_pc.bx:
++ mov ebx, [esp]
++ retn
++
+ ;-----------------------------------------------------------------------------
+ ; void fdct_mmx_ffmpeg(int16_t block[64]);
+ ;-----------------------------------------------------------------------------
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_mmx_skal.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -294,15 +294,15 @@ MMX_One:
+ paddsw mm2, mm1 ; mm2: t6+t5
+ movq [%1+0*16], mm5 ; => out0
+
+- movq mm4, [tan2] ; mm4 <= tan2
++ movq mm4, [ebx + tan2 wrt ..gotoff] ; mm4 <= tan2
+ pmulhw mm4, mm7 ; tm03*tan2
+- movq mm5, [tan2] ; mm5 <= tan2
++ movq mm5, [ebx + tan2 wrt ..gotoff] ; mm5 <= tan2
+ psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12
+ pmulhw mm5, mm6 ; tm12*tan2
+ paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03
+
+- movq mm6, [sqrt2]
+- movq mm7, [MMX_One]
++ movq mm6, [ebx + sqrt2 wrt ..gotoff]
++ movq mm7, [ebx + MMX_One wrt ..gotoff]
+
+ pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4
+ por mm5, mm7 ; correct out2
+@@ -320,8 +320,8 @@ MMX_One:
+ paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65
+ paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65
+
+- movq mm4, [tan3] ; tan3 - 1
+- movq mm5, [tan1] ; tan1
++ movq mm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1
++ movq mm5, [ebx + tan1 wrt ..gotoff] ; tan1
+
+ movq mm7, mm3 ; save tm465
+ pmulhw mm3, mm4 ; tm465*(tan3-1)
+@@ -364,23 +364,23 @@ MMX_One:
+ punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
+ punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
+
+- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
+- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
++ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
++ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
+ pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
+- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
++ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
+ pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
+- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
++ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
+ pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
+- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
++ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
+ pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
+- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
++ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
+ pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
+ paddd mm2, mm3 ; [ out0 | out1 ]
+ pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
+ psrad mm2, 16
+- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
++ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
+ paddd mm4, mm5 ; [ out2 | out3 ]
+- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
++ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
+ psrad mm4, 16
+
+ paddd mm6, mm7 ; [ out4 | out5 ]
+@@ -422,23 +422,23 @@ MMX_One:
+ punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1]
+ punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3]
+
+- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17]
+- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19]
++ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17]
++ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19]
+ pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17]
+- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21]
++ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21]
+ pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19]
+- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23]
++ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23]
+ pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21]
+- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25]
++ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25]
+ pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23]
+- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27]
++ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27]
+ pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25]
+ paddd mm2, mm3 ; [ out0 | out1 ]
+ pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27]
+ psrad mm2, 16
+- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
++ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29]
+ paddd mm4, mm5 ; [ out2 | out3 ]
+- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
++ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31]
+ psrad mm4, 16
+
+ paddd mm6, mm7 ; [ out4 | out5 ]
+@@ -467,12 +467,16 @@ MMX_One:
+ ALIGN 16
+ cglobal %1
+ %1:
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
++
+ %ifdef UNROLLED_LOOP
+- mov ecx, [esp + 4]
++ mov ecx, [esp + 4 + 4]
+ %else
+- push ebx
++ push esi
+ push edi
+- mov ecx, [esp + 8 + 4]
++ mov ecx, [esp + 12 + 4]
+ %endif
+
+ fLLM_PASS ecx+0, ecx+0, 3
+@@ -481,27 +485,28 @@ cglobal %1
+ %ifdef UNROLLED_LOOP
+ %assign i 0
+ %rep 8
+- %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8
++ %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff
+ %assign i i+1
+ %endrep
+ %else
+ mov eax, 8
+- mov edx, fdct_table
+- mov ebx, fdct_rounding_1
+- mov edi, fdct_rounding_2
++ lea edx, [ebx + fdct_table wrt ..gotoff]
++ lea esi, [ebx + fdct_rounding_1 wrt ..gotoff]
++ lea edi, [ebx + fdct_rounding_2 wrt ..gotoff]
+ .loop
+- %2 ecx, ecx, edx, ebx, edi
++ %2 ecx, ecx, edx, esi, edi
+ add ecx, 2*8
+ add edx, 2*32
+- add ebx, 2*4
++ add esi, 2*4
+ add edi, 2*4
+ dec eax
+ jne .loop
+
+ pop edi
+- pop ebx
++ pop esi
+ %endif
+
++ pop ebx
+ ret
+ .endfunc
+ %endmacro
+@@ -512,6 +517,11 @@ cglobal %1
+
+ SECTION .text
+
++extern _GLOBAL_OFFSET_TABLE_
++get_pc.bx:
++ mov ebx, [esp]
++ retn
++
+ ;-----------------------------------------------------------------------------
+ ; void fdct_mmx_skal(int16_t block[64]];
+ ;-----------------------------------------------------------------------------
+diff -urp xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm
+--- xvidcore-1.1.2-old/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 19:43:48.000000000 +0100
++++ xvidcore-1.1.2/src/dct/x86_asm/fdct_sse2_skal.asm 2007-01-27 13:33:30.000000000 +0100
+@@ -238,10 +238,10 @@ cglobal fdct_sse2_skal
+ pshufd xmm6, xmm0, 01010101b ; [13131313]
+ pshufd xmm7, xmm0, 11111111b ; [57575757]
+
+- pmaddwd xmm4, [%2+ 0] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
+- pmaddwd xmm5, [%2+16] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
+- pmaddwd xmm6, [%2+32] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
+- pmaddwd xmm7, [%2+48] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
++ pmaddwd xmm4, [ 0 + %2] ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
++ pmaddwd xmm5, [16 + %2] ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
++ pmaddwd xmm6, [32 + %2] ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
++ pmaddwd xmm7, [48 + %2] ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
+ paddd xmm4, [%3] ; Round
+
+ paddd xmm6, xmm7 ; [b0|b1|b2|b3]
+@@ -267,12 +267,12 @@ cglobal fdct_sse2_skal
+
+ %macro iLLM_PASS 1 ; %1: src/dst
+
+- movdqa xmm0, [tan3] ; t3-1
++ movdqa xmm0, [ebx + tan3 wrt ..gotoff] ; t3-1
+ movdqa xmm3, [%1+16*3] ; x3
+ movdqa xmm1, xmm0 ; t3-1
+ movdqa xmm5, [%1+16*5] ; x5
+
+- movdqa xmm4, [tan1] ; t1
++ movdqa xmm4, [ebx + tan1 wrt ..gotoff] ; t1
+ movdqa xmm6, [%1+16*1] ; x1
+ movdqa xmm7, [%1+16*7] ; x7
+ movdqa xmm2, xmm4 ; t1
+@@ -290,7 +290,7 @@ cglobal fdct_sse2_skal
+ psubsw xmm2, xmm7 ; x1*t1-x7 = tm17
+
+
+- movdqa xmm3, [sqrt2]
++ movdqa xmm3, [ebx + sqrt2 wrt ..gotoff]
+ movdqa xmm7, xmm4
+ movdqa xmm6, xmm2
+ psubsw xmm4, xmm1 ; tp17-tp35 = t1
+@@ -310,7 +310,7 @@ cglobal fdct_sse2_skal
+ paddsw xmm0, xmm0 ; 2.(t1+t2) = b1
+ paddsw xmm4, xmm4 ; 2.(t1-t2) = b2
+
+- movdqa xmm7, [tan2] ; t2
++ movdqa xmm7, [ebx + tan2 wrt ..gotoff] ; t2
+ movdqa xmm3, [%1+2*16] ; x2
+ movdqa xmm6, [%1+6*16] ; x6
+ movdqa xmm5, xmm7 ; t2
+@@ -402,55 +402,58 @@ cglobal fdct_sse2_skal
+
+ ALIGN 16
+ idct_sse2_skal:
++ push ebx
++ call get_pc.bx
++ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+- mov ecx, [esp+ 4] ; Src
++ mov ecx, [esp+ 4 +4] ; Src
+
+ TEST_ROW ecx, .Row0_Round
+- iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11
++ iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row1
+ .Row0_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
++ movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx ], xmm0
+
+ .Row1
+ TEST_ROW ecx+16, .Row1_Round
+- iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11
++ iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row2
+ .Row1_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
++ movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx+16 ], xmm0
+
+ .Row2
+ TEST_ROW ecx+32, .Row2_Round
+- iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11
++ iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11
+ jmp .Row3
+ .Row2_Round
+- movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
++ movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff]
+ movdqa [ecx+32 ], xmm0
+
+ .Row3
+ TEST_ROW ecx+48, .Row4
+- iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11
++ iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row4
+ TEST_ROW ecx+64, .Row5
+- iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11
++ iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row5
+ TEST_ROW ecx+80, .Row6
+- iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11
++ iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row6
+ TEST_ROW ecx+96, .Row7
+- iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11
++ iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11
+
+ .Row7
+ TEST_ROW ecx+112, .End
+- iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11
++ iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt ..gotoff, 11
+ .End
+
+ iLLM_PASS ecx
+-
++ pop ebx
+ ret
+ .endfunc
+
+@@ -507,15 +510,15 @@ idct_sse2_skal:
+ paddsw xmm2, xmm1 ; xmm2: t6+t5
+ movdqa [%1+0*16], xmm5 ; => out0
+
+- movdqa xmm4, [tan2] ; xmm4 <= tan2
++ movdqa xmm4, [ebx + tan2 wrt ..gotoff] ; xmm4 <= tan2
+ pmulhw xmm4, xmm7 ; tm03*tan2
+- movdqa xmm5, [tan2] ; xmm5 <= tan2
++ movdqa xmm5, [ebx + tan2 wrt ..gotoff] ; xmm5 <= tan2
+ psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12
+ pmulhw xmm5, xmm6 ; tm12*tan2
+ paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03
+
+- movdqa xmm6, [sqrt2]
+- movdqa xmm7, [Rounder1]
++ movdqa xmm6, [ebx + sqrt2 wrt ..gotoff]
++ movdqa xmm7, [ebx + Rounder1 wrt ..gotoff]
+
+ pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4
+ por xmm5, xmm7 ; correct out2
+@@ -533,8 +536,8 @@ idct_sse2_skal:
<<Diff was trimmed, longer than 597 lines>>