mirror of
https://github.com/ZDoom/gzdoom.git
synced 2024-11-14 08:31:23 +00:00
e666cde418
- Updated project files for nasm 2.0, which is now named nasm.exe for the Windows version, rather than nasmw.exe. Also fixed the annoying new warnings it generated. SVN r593 (trunk)
568 lines
21 KiB
NASM
568 lines
21 KiB
NASM
; vim:filetype=nasm ts=8
|
|
|
|
; libFLAC - Free Lossless Audio Codec library
|
|
; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
|
|
;
|
|
; Redistribution and use in source and binary forms, with or without
|
|
; modification, are permitted provided that the following conditions
|
|
; are met:
|
|
;
|
|
; - Redistributions of source code must retain the above copyright
|
|
; notice, this list of conditions and the following disclaimer.
|
|
;
|
|
; - Redistributions in binary form must reproduce the above copyright
|
|
; notice, this list of conditions and the following disclaimer in the
|
|
; documentation and/or other materials provided with the distribution.
|
|
;
|
|
; - Neither the name of the Xiph.org Foundation nor the names of its
|
|
; contributors may be used to endorse or promote products derived from
|
|
; this software without specific prior written permission.
|
|
;
|
|
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
|
|
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
%include "ia32/nasm.h"
|
|
|
|
data_section
|
|
|
|
cextern FLAC__crc16_table ; unsigned FLAC__crc16_table[256];
|
|
cextern bitreader_read_from_client_ ; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
|
|
|
|
cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
|
|
|
|
code_section
|
|
|
|
|
|
; **********************************************************************
|
|
;
|
|
; void FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
|
|
;
|
|
; Some details like assertions and other checking is performed by the caller.
|
|
ALIGN 16
|
|
cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
|
|
|
|
;ASSERT(0 != br);
|
|
;ASSERT(0 != br->buffer);
|
|
; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
|
|
;ASSERT(FLAC__BITS_PER_WORD == 32);
|
|
;ASSERT(parameter < 32);
|
|
; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it
|
|
|
|
;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
|
|
;; [esp + 16] unsigned parameter
|
|
;; [esp + 12] unsigned nvals
|
|
;; [esp + 8] int vals[]
|
|
;; [esp + 4] FLAC__BitReader *br
|
|
mov eax, [esp + 12] ; if(nvals == 0)
|
|
test eax, eax
|
|
ja .nvals_gt_0
|
|
mov eax, 1 ; return true;
|
|
ret
|
|
|
|
.nvals_gt_0:
|
|
push ebp
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
sub esp, 4
|
|
;; [esp + 36] unsigned parameter
|
|
;; [esp + 32] unsigned nvals
|
|
;; [esp + 28] int vals[]
|
|
;; [esp + 24] FLAC__BitReader *br
|
|
;; [esp] ucbits
|
|
mov ebp, [esp + 24] ; ebp <- br == br->buffer
|
|
mov esi, [ebp + 16] ; esi <- br->consumed_words (aka 'cwords' in the C version)
|
|
mov ecx, [ebp + 20] ; ecx <- br->consumed_bits (aka 'cbits' in the C version)
|
|
xor edi, edi ; edi <- 0 'uval'
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
;; [ebp] br->buffer
|
|
;; [ebp + 8] br->words
|
|
;; [ebp + 12] br->bytes
|
|
;; [ebp + 16] br->consumed_words
|
|
;; [ebp + 20] br->consumed_bits
|
|
;; [ebp + 24] br->read_crc
|
|
;; [ebp + 28] br->crc16_align
|
|
|
|
; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
|
|
mov eax, [ebp + 8] ; eax <- br->words
|
|
sub eax, esi ; eax <- br->words-cwords
|
|
shl eax, 2 ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
|
|
add eax, [ebp + 12] ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
|
|
shl eax, 3 ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
|
|
sub eax, ecx ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
|
|
mov [esp], eax ; ucbits <- eax
|
|
|
|
ALIGN 16
|
|
.val_loop: ; while(1) {
|
|
|
|
;
|
|
; read unary part
|
|
;
|
|
.unary_loop: ; while(1) {
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
cmp esi, [ebp + 8] ; while(cwords < br->words) /* if we've not consumed up to a partial tail word... */
|
|
jae near .c1_next1
|
|
.c1_loop: ; {
|
|
mov ebx, [ebp]
|
|
mov eax, [ebx + 4*esi] ; b = br->buffer[cwords]
|
|
mov edx, eax ; edx = br->buffer[cwords] (saved for later use)
|
|
shl eax, cl ; b = br->buffer[cwords] << cbits
|
|
test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
|
|
jz near .c1_next2 ; if(b) {
|
|
bsr ebx, eax
|
|
not ebx
|
|
and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax)
|
|
add ecx, ebx ; cbits += i;
|
|
add edi, ebx ; uval += i;
|
|
add ecx, byte 1 ; cbits++; /* skip over stop bit */
|
|
test ecx, ~31
|
|
jz near .break1 ; if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
|
|
; crc16_update_word_(br, br->buffer[cwords]);
|
|
push edi ; [need more registers]
|
|
bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
|
|
mov ecx, [ebp + 28] ; ecx <- br->crc16_align
|
|
mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
|
|
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
|
|
mov edi, _FLAC__crc16_table
|
|
%else
|
|
mov edi, FLAC__crc16_table
|
|
%endif
|
|
;; eax (ax) crc a.k.a. br->read_crc
|
|
;; ebx (bl) intermediate result index into FLAC__crc16_table[]
|
|
;; ecx br->crc16_align
|
|
;; edx byteswapped brword to CRC
|
|
;; esi cwords
|
|
;; edi unsigned FLAC__crc16_table[]
|
|
;; ebp br
|
|
test ecx, ecx ; switch(br->crc16_align) ...
|
|
jnz .c0b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
|
|
.c0b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
.c0b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shr edx, 16
|
|
.c0b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
.c0b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
movzx eax, ax
|
|
mov [ebp + 24], eax ; br->read_crc <- crc
|
|
pop edi
|
|
|
|
add esi, byte 1 ; cwords++;
|
|
xor ecx, ecx ; cbits = 0;
|
|
; }
|
|
jmp near .break1 ; goto break1;
|
|
;; this section relocated out of the way for performance
|
|
.c0b4:
|
|
mov [ebp + 28], dword 0 ; br->crc16_align <- 0
|
|
cmp ecx, 8
|
|
je .c0b1
|
|
shr edx, 16
|
|
cmp ecx, 16
|
|
je .c0b2
|
|
jmp .c0b3
|
|
|
|
;; this section relocated out of the way for performance
|
|
.c1b4:
|
|
mov [ebp + 28], dword 0 ; br->crc16_align <- 0
|
|
cmp ecx, 8
|
|
je .c1b1
|
|
shr edx, 16
|
|
cmp ecx, 16
|
|
je .c1b2
|
|
jmp .c1b3
|
|
|
|
.c1_next2: ; } else {
|
|
;; ecx cbits
|
|
;; edx current brword 'b'
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
add edi, 32
|
|
sub edi, ecx ; uval += FLAC__BITS_PER_WORD - cbits;
|
|
; crc16_update_word_(br, br->buffer[cwords]);
|
|
push edi ; [need more registers]
|
|
bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
|
|
mov ecx, [ebp + 28] ; ecx <- br->crc16_align
|
|
mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
|
|
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
|
|
mov edi, _FLAC__crc16_table
|
|
%else
|
|
mov edi, FLAC__crc16_table
|
|
%endif
|
|
;; eax (ax) crc a.k.a. br->read_crc
|
|
;; ebx (bl) intermediate result index into FLAC__crc16_table[]
|
|
;; ecx br->crc16_align
|
|
;; edx byteswapped brword to CRC
|
|
;; esi cwords
|
|
;; edi unsigned FLAC__crc16_table[]
|
|
;; ebp br
|
|
test ecx, ecx ; switch(br->crc16_align) ...
|
|
jnz .c1b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
|
|
.c1b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
.c1b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shr edx, 16
|
|
.c1b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
.c1b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
movzx eax, ax
|
|
mov [ebp + 24], eax ; br->read_crc <- crc
|
|
pop edi
|
|
|
|
add esi, byte 1 ; cwords++;
|
|
xor ecx, ecx ; cbits = 0;
|
|
; /* didn't find stop bit yet, have to keep going... */
|
|
; }
|
|
|
|
cmp esi, [ebp + 8] ; } while(cwords < br->words) /* if we've not consumed up to a partial tail word... */
|
|
jb near .c1_loop
|
|
|
|
.c1_next1:
|
|
; at this point we've eaten up all the whole words; have to try
|
|
; reading through any tail bytes before calling the read callback.
|
|
; this is a repeat of the above logic adjusted for the fact we
|
|
; don't have a whole word. note though if the client is feeding
|
|
; us data a byte at a time (unlikely), br->consumed_bits may not
|
|
; be zero.
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
mov edx, [ebp + 12] ; edx <- br->bytes
|
|
test edx, edx
|
|
jz .read1 ; if(br->bytes) { [NOTE: this case is rare so it doesn't have to be all that fast ]
|
|
mov ebx, [ebp]
|
|
shl edx, 3 ; edx <- const unsigned end = br->bytes * 8;
|
|
mov eax, [ebx + 4*esi] ; b = br->buffer[cwords]
|
|
xchg edx, ecx ; [edx <- cbits , ecx <- end]
|
|
mov ebx, 0xffffffff ; ebx <- FLAC__WORD_ALL_ONES
|
|
shr ebx, cl ; ebx <- FLAC__WORD_ALL_ONES >> end
|
|
not ebx ; ebx <- ~(FLAC__WORD_ALL_ONES >> end)
|
|
xchg edx, ecx ; [edx <- end , ecx <- cbits]
|
|
and eax, ebx ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
|
|
shl eax, cl ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
|
|
test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
|
|
jz .c1_next3 ; if(b) {
|
|
bsr ebx, eax
|
|
not ebx
|
|
and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax)
|
|
add ecx, ebx ; cbits += i;
|
|
add edi, ebx ; uval += i;
|
|
add ecx, byte 1 ; cbits++; /* skip over stop bit */
|
|
jmp short .break1 ; goto break1;
|
|
.c1_next3: ; } else {
|
|
sub edi, ecx
|
|
add edi, edx ; uval += end - cbits;
|
|
add ecx, edx ; cbits += end
|
|
; /* didn't find stop bit yet, have to keep going... */
|
|
; }
|
|
; }
|
|
.read1:
|
|
; flush registers and read; bitreader_read_from_client_() does
|
|
; not touch br->consumed_bits at all but we still need to set
|
|
; it in case it fails and we have to return false.
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
mov [ebp + 16], esi ; br->consumed_words = cwords;
|
|
mov [ebp + 20], ecx ; br->consumed_bits = cbits;
|
|
push ecx ; /* save */
|
|
push ebp ; /* push br argument */
|
|
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
|
|
call _bitreader_read_from_client_
|
|
%else
|
|
call bitreader_read_from_client_
|
|
%endif
|
|
pop edx ; /* discard, unused */
|
|
pop ecx ; /* restore */
|
|
mov esi, [ebp + 16] ; cwords = br->consumed_words;
|
|
; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
|
|
mov ebx, [ebp + 8] ; ebx <- br->words
|
|
sub ebx, esi ; ebx <- br->words-cwords
|
|
shl ebx, 2 ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
|
|
add ebx, [ebp + 12] ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
|
|
shl ebx, 3 ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
|
|
sub ebx, ecx ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
|
|
add ebx, edi ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
|
|
; + uval to offset our count by the # of unary bits already
|
|
; consumed before the read, because we will add these back
|
|
; in all at once at break1
|
|
mov [esp], ebx ; ucbits <- ebx
|
|
test eax, eax ; if(!bitreader_read_from_client_(br))
|
|
jnz near .unary_loop
|
|
jmp .end ; return false; /* eax (the return value) is already 0 */
|
|
; } /* end while(1) unary part */
|
|
|
|
ALIGN 16
|
|
.break1:
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
;; [esp] ucbits
|
|
sub [esp], edi ; ucbits -= uval;
|
|
sub dword [esp], byte 1 ; ucbits--; /* account for stop bit */
|
|
|
|
;
|
|
; read binary part
|
|
;
|
|
mov ebx, [esp + 36] ; ebx <- parameter
|
|
test ebx, ebx ; if(parameter) {
|
|
jz near .break2
|
|
.read2:
|
|
cmp [esp], ebx ; while(ucbits < parameter) {
|
|
jae .c2_next1
|
|
; flush registers and read; bitreader_read_from_client_() does
|
|
; not touch br->consumed_bits at all but we still need to set
|
|
; it in case it fails and we have to return false.
|
|
mov [ebp + 16], esi ; br->consumed_words = cwords;
|
|
mov [ebp + 20], ecx ; br->consumed_bits = cbits;
|
|
push ecx ; /* save */
|
|
push ebp ; /* push br argument */
|
|
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
|
|
call _bitreader_read_from_client_
|
|
%else
|
|
call bitreader_read_from_client_
|
|
%endif
|
|
pop edx ; /* discard, unused */
|
|
pop ecx ; /* restore */
|
|
mov esi, [ebp + 16] ; cwords = br->consumed_words;
|
|
; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
|
|
mov edx, [ebp + 8] ; edx <- br->words
|
|
sub edx, esi ; edx <- br->words-cwords
|
|
shl edx, 2 ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
|
|
add edx, [ebp + 12] ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
|
|
shl edx, 3 ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
|
|
sub edx, ecx ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
|
|
mov [esp], edx ; ucbits <- edx
|
|
test eax, eax ; if(!bitreader_read_from_client_(br))
|
|
jnz .read2
|
|
jmp .end ; return false; /* eax (the return value) is already 0 */
|
|
; }
|
|
.c2_next1:
|
|
;; ebx parameter
|
|
;; ecx cbits
|
|
;; esi cwords
|
|
;; edi uval
|
|
;; ebp br
|
|
;; [esp] ucbits
|
|
cmp esi, [ebp + 8] ; if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
|
|
jae near .c2_next2
|
|
test ecx, ecx ; if(cbits) {
|
|
jz near .c2_next3 ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
|
|
mov eax, 32
|
|
mov edx, [ebp]
|
|
sub eax, ecx ; const unsigned n = FLAC__BITS_PER_WORD - cbits;
|
|
mov edx, [edx + 4*esi] ; const brword word = br->buffer[cwords];
|
|
cmp ebx, eax ; if(parameter < n) {
|
|
jae .c2_next4
|
|
; uval <<= parameter;
|
|
; uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
|
|
shl edx, cl
|
|
xchg ebx, ecx
|
|
shld edi, edx, cl
|
|
add ebx, ecx ; cbits += parameter;
|
|
xchg ebx, ecx ; ebx <- parameter, ecx <- cbits
|
|
jmp .break2 ; goto break2;
|
|
; }
|
|
.c2_next4:
|
|
; uval <<= n;
|
|
; uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
|
|
%if 1
|
|
rol edx, cl ; @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
|
|
; @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
|
|
%else
|
|
shl edx, cl
|
|
%endif
|
|
xchg eax, ecx
|
|
shld edi, edx, cl
|
|
xchg eax, ecx
|
|
%if 1
|
|
ror edx, cl ; restored.
|
|
%else
|
|
mov edx, [ebp]
|
|
mov edx, [edx + 4*esi]
|
|
%endif
|
|
; crc16_update_word_(br, br->buffer[cwords]);
|
|
push edi ; [need more registers]
|
|
push ebx ; [need more registers]
|
|
push eax ; [need more registers]
|
|
bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
|
|
mov ecx, [ebp + 28] ; ecx <- br->crc16_align
|
|
mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
|
|
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
|
|
mov edi, _FLAC__crc16_table
|
|
%else
|
|
mov edi, FLAC__crc16_table
|
|
%endif
|
|
;; eax (ax) crc a.k.a. br->read_crc
|
|
;; ebx (bl) intermediate result index into FLAC__crc16_table[]
|
|
;; ecx br->crc16_align
|
|
;; edx byteswapped brword to CRC
|
|
;; esi cwords
|
|
;; edi unsigned FLAC__crc16_table[]
|
|
;; ebp br
|
|
test ecx, ecx ; switch(br->crc16_align) ...
|
|
jnz .c2b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
|
|
.c2b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
|
|
.c2b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
|
|
shr edx, 16
|
|
.c2b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
|
|
movzx ebx, dl
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
|
|
.c2b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
|
|
movzx ebx, dh
|
|
mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
shl eax, 8 ; ax <- (crc<<8)
|
|
xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
|
|
movzx eax, ax
|
|
mov [ebp + 24], eax ; br->read_crc <- crc
|
|
pop eax
|
|
pop ebx
|
|
pop edi
|
|
add esi, byte 1 ; cwords++;
|
|
mov ecx, ebx
|
|
sub ecx, eax ; cbits = parameter - n;
|
|
jz .break2 ; if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
|
|
; uval <<= cbits;
|
|
; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
|
|
mov eax, [ebp]
|
|
mov eax, [eax + 4*esi]
|
|
shld edi, eax, cl
|
|
; }
|
|
jmp .break2 ; goto break2;
|
|
|
|
;; this section relocated out of the way for performance
|
|
.c2b4:
|
|
mov [ebp + 28], dword 0 ; br->crc16_align <- 0
|
|
cmp ecx, 8
|
|
je .c2b1
|
|
shr edx, 16
|
|
cmp ecx, 16
|
|
je .c2b2
|
|
jmp .c2b3
|
|
|
|
.c2_next3: ; } else {
|
|
mov ecx, ebx ; cbits = parameter;
|
|
; uval <<= cbits;
|
|
; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
|
|
mov eax, [ebp]
|
|
mov eax, [eax + 4*esi]
|
|
shld edi, eax, cl
|
|
jmp .break2 ; goto break2;
|
|
; }
|
|
.c2_next2: ; } else {
|
|
; in this case we're starting our read at a partial tail word;
|
|
; the reader has guaranteed that we have at least 'parameter'
|
|
; bits available to read, which makes this case simpler.
|
|
; uval <<= parameter;
|
|
; if(cbits) {
|
|
; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
|
|
; uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
|
|
; cbits += parameter;
|
|
; goto break2;
|
|
; } else {
|
|
; cbits = parameter;
|
|
; uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
|
|
; goto break2;
|
|
; }
|
|
; the above is much shorter in assembly:
|
|
mov eax, [ebp]
|
|
mov eax, [eax + 4*esi] ; eax <- br->buffer[cwords]
|
|
shl eax, cl ; eax <- br->buffer[cwords] << cbits
|
|
add ecx, ebx ; cbits += parameter
|
|
xchg ebx, ecx ; ebx <- cbits, ecx <- parameter
|
|
shld edi, eax, cl ; uval <<= parameter <<< 'parameter' bits of tail word
|
|
xchg ebx, ecx ; ebx <- parameter, ecx <- cbits
|
|
; }
|
|
; }
|
|
.break2:
|
|
sub [esp], ebx ; ucbits -= parameter;
|
|
|
|
;
|
|
; compose the value
|
|
;
|
|
mov ebx, [esp + 28] ; ebx <- vals
|
|
mov edx, edi ; edx <- uval
|
|
and edi, 1 ; edi <- uval & 1
|
|
shr edx, 1 ; edx <- uval >> 1
|
|
neg edi ; edi <- -(int)(uval & 1)
|
|
xor edx, edi ; edx <- (uval >> 1 ^ -(int)(uval & 1))
|
|
mov [ebx], edx ; *vals <- edx
|
|
sub dword [esp + 32], byte 1 ; --nvals;
|
|
jz .finished ; if(nvals == 0) /* jump to finish */
|
|
xor edi, edi ; uval = 0;
|
|
add dword [esp + 28], 4 ; ++vals
|
|
jmp .val_loop ; }
|
|
|
|
.finished:
|
|
mov [ebp + 16], esi ; br->consumed_words = cwords;
|
|
mov [ebp + 20], ecx ; br->consumed_bits = cbits;
|
|
mov eax, 1
|
|
.end:
|
|
add esp, 4
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
pop ebp
|
|
ret
|
|
|
|
end:
|
|
|
|
%ifdef OBJ_FORMAT_elf
|
|
section .note.GNU-stack noalloc
|
|
%endif
|