;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding ; * ; * inffas32.asm is derivated from inffas86.c, with translation of assembly code ; * ; * Copyright (C) 1995-2003 Mark Adler ; * For conditions of distribution and use, see copyright notice in zlib.h ; * ; * Copyright (C) 2003 Chris Anderson <christop@charm.net> ; * Please use the copyright conditions above. ; * ; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from ; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at ; * the moment. I have successfully compiled and tested this code with gcc2.96, ; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S ; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX ; * enabled. I will attempt to merge the MMX code into this version. Newer ; * versions of this and inffast.S can be found at ; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ ; * ; * 2005 : modification by Gilles Vollant ; */ ; For Visual C++ 4.x and higher and ML 6.x and higher ; ml.exe is in directory \MASM611C of Win95 DDK ; ml.exe is also distributed in http://www.masm32.com/masmdl.htm ; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ ; ; ; compile with command line option ; ml /coff /Zi /c /Flinffas32.lst inffas32.asm ; if you define NO_GZIP (see inflate.h), compile with ; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm ; zlib122sup is 0 fort zlib 1.2.2.1 and lower ; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head ; in inflate_state in inflate.h) zlib1222sup equ 8 IFDEF GUNZIP INFLATE_MODE_TYPE equ 11 INFLATE_MODE_BAD equ 26 ELSE IFNDEF NO_GUNZIP INFLATE_MODE_TYPE equ 11 INFLATE_MODE_BAD equ 26 ELSE INFLATE_MODE_TYPE equ 3 INFLATE_MODE_BAD equ 17 ENDIF ENDIF ; 75 "inffast.S" ;FILE "inffast.S" ;;;GLOBAL _inflate_fast ;;;SECTION .text .586p .mmx name inflate_fast_x86 .MODEL FLAT _DATA segment inflate_fast_use_mmx: dd 1 _TEXT segment PUBLIC _inflate_fast ALIGN 4 _inflate_fast: jmp inflate_fast_entry ALIGN 4 db 'Fast decoding Code from Chris Anderson' db 0 ALIGN 4 invalid_literal_length_code_msg: db 'invalid literal/length code' db 0 ALIGN 4 invalid_distance_code_msg: db 'invalid distance code' db 0 ALIGN 4 invalid_distance_too_far_msg: db 'invalid distance too far back' db 0 ALIGN 4 inflate_fast_mask: dd 0 dd 1 dd 3 dd 7 dd 15 dd 31 dd 63 dd 127 dd 255 dd 511 dd 1023 dd 2047 dd 4095 dd 8191 dd 16383 dd 32767 dd 65535 dd 131071 dd 262143 dd 524287 dd 1048575 dd 2097151 dd 4194303 dd 8388607 dd 16777215 dd 33554431 dd 67108863 dd 134217727 dd 268435455 dd 536870911 dd 1073741823 dd 2147483647 dd 4294967295 mode_state equ 0 ;/* state->mode */ wsize_state equ (32+zlib1222sup) ;/* state->wsize */ write_state equ (36+4+zlib1222sup) ;/* state->write */ window_state equ (40+4+zlib1222sup) ;/* state->window */ hold_state equ (44+4+zlib1222sup) ;/* state->hold */ bits_state equ (48+4+zlib1222sup) ;/* state->bits */ lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ ;;SECTION .text ; 205 "inffast.S" ;GLOBAL inflate_fast_use_mmx ;SECTION .data ; GLOBAL inflate_fast_use_mmx:object ;.size inflate_fast_use_mmx, 4 ; 226 "inffast.S" ;SECTION .text ALIGN 4 inflate_fast_entry: push edi push esi push ebp push ebx pushfd sub esp,64 cld mov esi, [esp+88] mov edi, [esi+28] mov edx, [esi+4] mov eax, [esi+0] add edx,eax sub edx,11 mov [esp+44],eax mov [esp+20],edx mov ebp, [esp+92] mov ecx, [esi+16] mov ebx, [esi+12] sub ebp,ecx neg ebp add ebp,ebx sub ecx,257 add ecx,ebx mov [esp+60],ebx mov [esp+40],ebp mov [esp+16],ecx ; 285 "inffast.S" mov eax, [edi+lencode_state] mov ecx, [edi+distcode_state] mov [esp+8],eax mov [esp+12],ecx mov eax,1 mov ecx, [edi+lenbits_state] shl eax,cl dec eax mov [esp+0],eax mov eax,1 mov ecx, [edi+distbits_state] shl eax,cl dec eax mov [esp+4],eax mov eax, [edi+wsize_state] mov ecx, [edi+write_state] mov edx, [edi+window_state] mov [esp+52],eax mov [esp+48],ecx mov [esp+56],edx mov ebp, [edi+hold_state] mov ebx, [edi+bits_state] ; 321 "inffast.S" mov esi, [esp+44] mov ecx, [esp+20] cmp ecx,esi ja L_align_long add ecx,11 sub ecx,esi mov eax,12 sub eax,ecx lea edi, [esp+28] rep movsb mov ecx,eax xor eax,eax rep stosb lea esi, [esp+28] mov [esp+20],esi jmp L_is_aligned L_align_long: test esi,3 jz L_is_aligned xor eax,eax mov al, [esi] inc esi mov ecx,ebx add ebx,8 shl eax,cl or ebp,eax jmp L_align_long L_is_aligned: mov edi, [esp+60] ; 366 "inffast.S" L_check_mmx: cmp dword ptr [inflate_fast_use_mmx],2 je L_init_mmx ja L_do_loop push eax push ebx push ecx push edx pushfd mov eax, [esp] xor dword ptr [esp],0200000h popfd pushfd pop edx xor edx,eax jz L_dont_use_mmx xor eax,eax cpuid cmp ebx,0756e6547h jne L_dont_use_mmx cmp ecx,06c65746eh jne L_dont_use_mmx cmp edx,049656e69h jne L_dont_use_mmx mov eax,1 cpuid shr eax,8 and eax,15 cmp eax,6 jne L_dont_use_mmx test edx,0800000h jnz L_use_mmx jmp L_dont_use_mmx L_use_mmx: mov dword ptr [inflate_fast_use_mmx],2 jmp L_check_mmx_pop L_dont_use_mmx: mov dword ptr [inflate_fast_use_mmx],3 L_check_mmx_pop: pop edx pop ecx pop ebx pop eax jmp L_check_mmx ; 426 "inffast.S" ALIGN 4 L_do_loop: ; 437 "inffast.S" cmp bl,15 ja L_get_length_code xor eax,eax lodsw mov cl,bl add bl,16 shl eax,cl or ebp,eax L_get_length_code: mov edx, [esp+0] mov ecx, [esp+8] and edx,ebp mov eax, [ecx+edx*4] L_dolen: mov cl,ah sub bl,ah shr ebp,cl test al,al jnz L_test_for_length_base shr eax,16 stosb L_while_test: cmp [esp+16],edi jbe L_break_loop cmp [esp+20],esi ja L_do_loop jmp L_break_loop L_test_for_length_base: ; 502 "inffast.S" mov edx,eax shr edx,16 mov cl,al test al,16 jz L_test_for_second_level_length and cl,15 jz L_save_len cmp bl,cl jae L_add_bits_to_len mov ch,cl xor eax,eax lodsw mov cl,bl add bl,16 shl eax,cl or ebp,eax mov cl,ch L_add_bits_to_len: mov eax,1 shl eax,cl dec eax sub bl,cl and eax,ebp shr ebp,cl add edx,eax L_save_len: mov [esp+24],edx L_decode_distance: ; 549 "inffast.S" cmp bl,15 ja L_get_distance_code xor eax,eax lodsw mov cl,bl add bl,16 shl eax,cl or ebp,eax L_get_distance_code: mov edx, [esp+4] mov ecx, [esp+12] and edx,ebp mov eax, [ecx+edx*4] L_dodist: mov edx,eax shr edx,16 mov cl,ah sub bl,ah shr ebp,cl ; 584 "inffast.S" mov cl,al test al,16 jz L_test_for_second_level_dist and cl,15 jz L_check_dist_one cmp bl,cl jae L_add_bits_to_dist mov ch,cl xor eax,eax lodsw mov cl,bl add bl,16 shl eax,cl or ebp,eax mov cl,ch L_add_bits_to_dist: mov eax,1 shl eax,cl dec eax sub bl,cl and eax,ebp shr ebp,cl add edx,eax jmp L_check_window L_check_window: ; 625 "inffast.S" mov [esp+44],esi mov eax,edi sub eax, [esp+40] cmp eax,edx jb L_clip_window mov ecx, [esp+24] mov esi,edi sub esi,edx sub ecx,3 mov al, [esi] mov [edi],al mov al, [esi+1] mov dl, [esi+2] add esi,3 mov [edi+1],al mov [edi+2],dl add edi,3 rep movsb mov esi, [esp+44] jmp L_while_test ALIGN 4 L_check_dist_one: cmp edx,1 jne L_check_window cmp [esp+40],edi je L_check_window dec edi mov ecx, [esp+24] mov al, [edi] sub ecx,3 mov [edi+1],al mov [edi+2],al mov [edi+3],al add edi,4 rep stosb jmp L_while_test ALIGN 4 L_test_for_second_level_length: test al,64 jnz L_test_for_end_of_block mov eax,1 shl eax,cl dec eax and eax,ebp add eax,edx mov edx, [esp+8] mov eax, [edx+eax*4] jmp L_dolen ALIGN 4 L_test_for_second_level_dist: test al,64 jnz L_invalid_distance_code mov eax,1 shl eax,cl dec eax and eax,ebp add eax,edx mov edx, [esp+12] mov eax, [edx+eax*4] jmp L_dodist ALIGN 4 L_clip_window: ; 721 "inffast.S" mov ecx,eax mov eax, [esp+52] neg ecx mov esi, [esp+56] cmp eax,edx jb L_invalid_distance_too_far add ecx,edx cmp dword ptr [esp+48],0 jne L_wrap_around_window sub eax,ecx add esi,eax ; 749 "inffast.S" mov eax, [esp+24] cmp eax,ecx jbe L_do_copy1 sub eax,ecx rep movsb mov esi,edi sub esi,edx jmp L_do_copy1 cmp eax,ecx jbe L_do_copy1 sub eax,ecx rep movsb mov esi,edi sub esi,edx jmp L_do_copy1 L_wrap_around_window: ; 793 "inffast.S" mov eax, [esp+48] cmp ecx,eax jbe L_contiguous_in_window add esi, [esp+52] add esi,eax sub esi,ecx sub ecx,eax mov eax, [esp+24] cmp eax,ecx jbe L_do_copy1 sub eax,ecx rep movsb mov esi, [esp+56] mov ecx, [esp+48] cmp eax,ecx jbe L_do_copy1 sub eax,ecx rep movsb mov esi,edi sub esi,edx jmp L_do_copy1 L_contiguous_in_window: ; 836 "inffast.S" add esi,eax sub esi,ecx mov eax, [esp+24] cmp eax,ecx jbe L_do_copy1 sub eax,ecx rep movsb mov esi,edi sub esi,edx L_do_copy1: ; 862 "inffast.S" mov ecx,eax rep movsb mov esi, [esp+44] jmp L_while_test ; 878 "inffast.S" ALIGN 4 L_init_mmx: emms movd mm0,ebp mov ebp,ebx ; 896 "inffast.S" movd mm4,dword ptr [esp+0] movq mm3,mm4 movd mm5,dword ptr [esp+4] movq mm2,mm5 pxor mm1,mm1 mov ebx, [esp+8] jmp L_do_loop_mmx ALIGN 4 L_do_loop_mmx: psrlq mm0,mm1 cmp ebp,32 ja L_get_length_code_mmx movd mm6,ebp movd mm7,dword ptr [esi] add esi,4 psllq mm7,mm6 add ebp,32 por mm0,mm7 L_get_length_code_mmx: pand mm4,mm0 movd eax,mm4 movq mm4,mm3 mov eax, [ebx+eax*4] L_dolen_mmx: movzx ecx,ah movd mm1,ecx sub ebp,ecx test al,al jnz L_test_for_length_base_mmx shr eax,16 stosb L_while_test_mmx: cmp [esp+16],edi jbe L_break_loop cmp [esp+20],esi ja L_do_loop_mmx jmp L_break_loop L_test_for_length_base_mmx: mov edx,eax shr edx,16 test al,16 jz L_test_for_second_level_length_mmx and eax,15 jz L_decode_distance_mmx psrlq mm0,mm1 movd mm1,eax movd ecx,mm0 sub ebp,eax and ecx, [inflate_fast_mask+eax*4] add edx,ecx L_decode_distance_mmx: psrlq mm0,mm1 cmp ebp,32 ja L_get_dist_code_mmx movd mm6,ebp movd mm7,dword ptr [esi] add esi,4 psllq mm7,mm6 add ebp,32 por mm0,mm7 L_get_dist_code_mmx: mov ebx, [esp+12] pand mm5,mm0 movd eax,mm5 movq mm5,mm2 mov eax, [ebx+eax*4] L_dodist_mmx: movzx ecx,ah mov ebx,eax shr ebx,16 sub ebp,ecx movd mm1,ecx test al,16 jz L_test_for_second_level_dist_mmx and eax,15 jz L_check_dist_one_mmx L_add_bits_to_dist_mmx: psrlq mm0,mm1 movd mm1,eax movd ecx,mm0 sub ebp,eax and ecx, [inflate_fast_mask+eax*4] add ebx,ecx L_check_window_mmx: mov [esp+44],esi mov eax,edi sub eax, [esp+40] cmp eax,ebx jb L_clip_window_mmx mov ecx,edx mov esi,edi sub esi,ebx sub ecx,3 mov al, [esi] mov [edi],al mov al, [esi+1] mov dl, [esi+2] add esi,3 mov [edi+1],al mov [edi+2],dl add edi,3 rep movsb mov esi, [esp+44] mov ebx, [esp+8] jmp L_while_test_mmx ALIGN 4 L_check_dist_one_mmx: cmp ebx,1 jne L_check_window_mmx cmp [esp+40],edi je L_check_window_mmx dec edi mov ecx,edx mov al, [edi] sub ecx,3 mov [edi+1],al mov [edi+2],al mov [edi+3],al add edi,4 rep stosb mov ebx, [esp+8] jmp L_while_test_mmx ALIGN 4 L_test_for_second_level_length_mmx: test al,64 jnz L_test_for_end_of_block and eax,15 psrlq mm0,mm1 movd ecx,mm0 and ecx, [inflate_fast_mask+eax*4] add ecx,edx mov eax, [ebx+ecx*4] jmp L_dolen_mmx ALIGN 4 L_test_for_second_level_dist_mmx: test al,64 jnz L_invalid_distance_code and eax,15 psrlq mm0,mm1 movd ecx,mm0 and ecx, [inflate_fast_mask+eax*4] mov eax, [esp+12] add ecx,ebx mov eax, [eax+ecx*4] jmp L_dodist_mmx ALIGN 4 L_clip_window_mmx: mov ecx,eax mov eax, [esp+52] neg ecx mov esi, [esp+56] cmp eax,ebx jb L_invalid_distance_too_far add ecx,ebx cmp dword ptr [esp+48],0 jne L_wrap_around_window_mmx sub eax,ecx add esi,eax cmp edx,ecx jbe L_do_copy1_mmx sub edx,ecx rep movsb mov esi,edi sub esi,ebx jmp L_do_copy1_mmx cmp edx,ecx jbe L_do_copy1_mmx sub edx,ecx rep movsb mov esi,edi sub esi,ebx jmp L_do_copy1_mmx L_wrap_around_window_mmx: mov eax, [esp+48] cmp ecx,eax jbe L_contiguous_in_window_mmx add esi, [esp+52] add esi,eax sub esi,ecx sub ecx,eax cmp edx,ecx jbe L_do_copy1_mmx sub edx,ecx rep movsb mov esi, [esp+56] mov ecx, [esp+48] cmp edx,ecx jbe L_do_copy1_mmx sub edx,ecx rep movsb mov esi,edi sub esi,ebx jmp L_do_copy1_mmx L_contiguous_in_window_mmx: add esi,eax sub esi,ecx cmp edx,ecx jbe L_do_copy1_mmx sub edx,ecx rep movsb mov esi,edi sub esi,ebx L_do_copy1_mmx: mov ecx,edx rep movsb mov esi, [esp+44] mov ebx, [esp+8] jmp L_while_test_mmx ; 1174 "inffast.S" L_invalid_distance_code: mov ecx, invalid_distance_code_msg mov edx,INFLATE_MODE_BAD jmp L_update_stream_state L_test_for_end_of_block: test al,32 jz L_invalid_literal_length_code mov ecx,0 mov edx,INFLATE_MODE_TYPE jmp L_update_stream_state L_invalid_literal_length_code: mov ecx, invalid_literal_length_code_msg mov edx,INFLATE_MODE_BAD jmp L_update_stream_state L_invalid_distance_too_far: mov esi, [esp+44] mov ecx, invalid_distance_too_far_msg mov edx,INFLATE_MODE_BAD jmp L_update_stream_state L_update_stream_state: mov eax, [esp+88] test ecx,ecx jz L_skip_msg mov [eax+24],ecx L_skip_msg: mov eax, [eax+28] mov [eax+mode_state],edx jmp L_break_loop ALIGN 4 L_break_loop: ; 1243 "inffast.S" cmp dword ptr [inflate_fast_use_mmx],2 jne L_update_next_in mov ebx,ebp L_update_next_in: ; 1266 "inffast.S" mov eax, [esp+88] mov ecx,ebx mov edx, [eax+28] shr ecx,3 sub esi,ecx shl ecx,3 sub ebx,ecx mov [eax+12],edi mov [edx+bits_state],ebx mov ecx,ebx lea ebx, [esp+28] cmp [esp+20],ebx jne L_buf_not_used sub esi,ebx mov ebx, [eax+0] mov [esp+20],ebx add esi,ebx mov ebx, [eax+4] sub ebx,11 add [esp+20],ebx L_buf_not_used: mov [eax+0],esi mov ebx,1 shl ebx,cl dec ebx cmp dword ptr [inflate_fast_use_mmx],2 jne L_update_hold psrlq mm0,mm1 movd ebp,mm0 emms L_update_hold: and ebp,ebx mov [edx+hold_state],ebp mov ebx, [esp+20] cmp ebx,esi jbe L_last_is_smaller sub ebx,esi add ebx,11 mov [eax+4],ebx jmp L_fixup_out L_last_is_smaller: sub esi,ebx neg esi add esi,11 mov [eax+4],esi L_fixup_out: mov ebx, [esp+16] cmp ebx,edi jbe L_end_is_smaller sub ebx,edi add ebx,257 mov [eax+16],ebx jmp L_done L_end_is_smaller: sub edi,ebx neg edi add edi,257 mov [eax+16],edi L_done: add esp,64 popfd pop ebx pop ebp pop esi pop edi ret _TEXT ends end