mirror of
https://github.com/ZDoom/gzdoom-gles.git
synced 2024-11-07 05:21:18 +00:00
2e56925c36
installation, not the bundled copies which might not match what is installed. - Upgraded bundled FLAC from version 1.1.2 to version 1.2.1. SVN r575 (trunk)
568 lines
21 KiB
NASM
568 lines
21 KiB
NASM
; vim:filetype=nasm ts=8
|
|
|
|
; libFLAC - Free Lossless Audio Codec library
|
|
; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
|
|
;
|
|
; Redistribution and use in source and binary forms, with or without
|
|
; modification, are permitted provided that the following conditions
|
|
; are met:
|
|
;
|
|
; - Redistributions of source code must retain the above copyright
|
|
; notice, this list of conditions and the following disclaimer.
|
|
;
|
|
; - Redistributions in binary form must reproduce the above copyright
|
|
; notice, this list of conditions and the following disclaimer in the
|
|
; documentation and/or other materials provided with the distribution.
|
|
;
|
|
; - Neither the name of the Xiph.org Foundation nor the names of its
|
|
; contributors may be used to endorse or promote products derived from
|
|
; this software without specific prior written permission.
|
|
;
|
|
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
|
|
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
; Project-local NASM helper header.  The data_section / code_section /
; cextern / cglobal / cident macros used in this file are not defined here;
; presumably they come from this include (NOTE(review): confirm in ia32/nasm.h).
%include "ia32/nasm.h"
|
|
|
|
; No data is defined in this file; the section macro is invoked with nothing following it.
data_section
|
|
|
|
; External symbols referenced by the routine below (C declarations shown for reference).
cextern FLAC__crc16_table ; unsigned FLAC__crc16_table[256];
|
|
cextern bitreader_read_from_client_ ; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
|
|
|
|
; The single entry point exported by this file.
cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
|
|
|
|
; Everything from here on is code.
code_section
|
|
|
|
|
|
; **********************************************************************
;
; FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
;
; Purpose:
;   Reads 'nvals' Rice-coded signed integers from the bit reader 'br' into
;   vals[0..nvals-1].  Each value is a unary part (count of 0 bits up to a
;   1 stop bit) followed by 'parameter' raw bits; the two are combined into
;   'uval' and stored as (uval >> 1) ^ -(int)(uval & 1).
;
; Calling convention (32-bit, all arguments on the stack, caller cleans up):
;   [esp +  4]  FLAC__BitReader *br
;   [esp +  8]  int vals[]
;   [esp + 12]  unsigned nvals
;   [esp + 16]  unsigned parameter   (caller guarantees parameter < 32)
;
; Returns (eax):
;   1  all values were read (also when nvals == 0)
;   0  bitreader_read_from_client_() returned 0 while more input was needed;
;      br->consumed_words / br->consumed_bits are flushed before each such call
;
; Registers:
;   ebp, ebx, esi, edi are saved on entry and restored on exit.
;   eax, ecx, edx and the flags are clobbered.
;
; Side effects:
;   Updates br->consumed_words, br->consumed_bits, br->read_crc and
;   br->crc16_align; every fully consumed word is folded into br->read_crc
;   using FLAC__crc16_table.  Uses the BSWAP instruction.
;
; Some details, such as assertions and other checking, are performed by the caller.
	ALIGN 16
cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	;ASSERT(0 != br);
	;ASSERT(0 != br->buffer);
	; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
	;ASSERT(FLAC__BITS_PER_WORD == 32);
	;ASSERT(parameter < 32);
	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it

	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
	;; [esp + 16]	unsigned parameter
	;; [esp + 12]	unsigned nvals
	;; [esp + 8]	int vals[]
	;; [esp + 4]	FLAC__BitReader *br
	mov	eax, [esp + 12]		; if(nvals == 0)
	test	eax, eax
	jnz	.nvals_gt_0		; (test clears CF, so jnz is equivalent to an unsigned 'above' check)
	mov	eax, 1			;   return true;
	ret

.nvals_gt_0:
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 4			; one local dword: ucbits
	;; [esp + 36]	unsigned parameter
	;; [esp + 32]	unsigned nvals
	;; [esp + 28]	int vals[]
	;; [esp + 24]	FLAC__BitReader *br
	;; [esp]	ucbits
	mov	ebp, [esp + 24]		; ebp <- br  (br->buffer is the dword at [ebp])
	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
	xor	edi, edi		; edi <- 0  'uval'
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [ebp]	br->buffer
	;; [ebp + 8]	br->words
	;; [ebp + 12]	br->bytes
	;; [ebp + 16]	br->consumed_words
	;; [ebp + 20]	br->consumed_bits
	;; [ebp + 24]	br->read_crc
	;; [ebp + 28]	br->crc16_align

					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	eax, [ebp + 8]		;   eax <- br->words
	sub	eax, esi		;   eax <- br->words-cwords
	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], eax		;   ucbits <- eax

	ALIGN 16
.val_loop:				; while(1) {

	;
	; read unary part
	;
.unary_loop:				;   while(1) {
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jae	near .c1_next1
.c1_loop:				;     {
	mov	ebx, [ebp]
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
	shl	eax, cl 		;       b = br->buffer[cwords] << cbits
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	near .c1_next2		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	test	ecx, ~31		;         ZF set <=> cbits < FLAC__BITS_PER_WORD: word not finished, no CRC needed
	jz	near .break1 		;         if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
					;           crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c0b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c0b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c0b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c0b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the 16-bit CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;           cwords++;
	xor	ecx, ecx		;           cbits = 0;
					;         }
	jmp	near .break1		;         goto break1;
	;; this section relocated out of the way for performance
.c0b4:					; crc16_align != 0: skip the bytes of this word that must not be CRC'd
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c0b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c0b2
	jmp	.c0b3

	;; this section relocated out of the way for performance
.c1b4:					; same as .c0b4, for the CRC copy used by .c1_next2
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c1b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c1b2
	jmp	.c1b3

.c1_next2:				;       } else {
	;; ecx		cbits
	;; edx		current brword 'b'
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	add	edi, 32
	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c1b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c1b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c1b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c1b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the 16-bit CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;         cwords++;
	xor	ecx, ecx		;         cbits = 0;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }

	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jb	near .c1_loop

.c1_next1:
	; at this point we've eaten up all the whole words; have to try
	; reading through any tail bytes before calling the read callback.
	; this is a repeat of the above logic adjusted for the fact we
	; don't have a whole word.  note though if the client is feeding
	; us data a byte at a time (unlikely), br->consumed_bits may not
	; be zero.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	edx, [ebp + 12]		; edx <- br->bytes
	test	edx, edx
	jz	.read1			;     if(br->bytes) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
	mov	ebx, [ebp]
	shl	edx, 3			;       edx <- const unsigned end = br->bytes * 8;
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]
	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)
	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
	shl	eax, cl 		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	.c1_next3		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	jmp	short .break1 		;         goto break1;
.c1_next3:				;       } else {
	sub	edi, ecx
	add	edi, edx		;         uval += end - cbits;
	; All valid tail bits [cbits, end) have now been consumed, so the new
	; position is exactly 'end'.  (Adding 'end' to a non-zero cbits would
	; skip bits once the client supplies the rest of this word.)
	mov	ecx, edx		;         cbits = end;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }
					;     }
.read1:
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
	push	ecx			;     /* save; ecx is not preserved across the call */
	push	ebp			;     /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;     /* discard the argument, value unused */
	pop	ecx			;     /* restore */
	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	ebx, [ebp + 8]		;       ebx <- br->words
	sub	ebx, esi		;       ebx <- br->words-cwords
	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
					;           + uval to offset our count by the # of unary bits already
					;           consumed before the read, because we will add these back
					;           in all at once at break1
	mov	[esp], ebx		;       ucbits <- ebx
	test	eax, eax		;     if(!bitreader_read_from_client_(br))
	jnz	near .unary_loop
	jmp	.end			;       return false; /* eax (the return value) is already 0 */
					;   } /* end while(1) unary part */

	ALIGN 16
.break1:
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	sub	[esp], edi		;   ucbits -= uval;
	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */

	;
	; read binary part
	;
	mov	ebx, [esp + 36]		;   ebx <- parameter
	test	ebx, ebx		;   if(parameter) {
	jz	near .break2
.read2:
	cmp	[esp], ebx		;     while(ucbits < parameter) {
	jae	.c2_next1
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
	push	ecx			;       /* save; ecx is not preserved across the call */
	push	ebp			;       /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;       /* discard the argument, value unused */
	pop	ecx			;       /* restore */
	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	edx, [ebp + 8]		;         edx <- br->words
	sub	edx, esi		;         edx <- br->words-cwords
	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], edx		;         ucbits <- edx
	test	eax, eax		;       if(!bitreader_read_from_client_(br))
	jnz	.read2
	jmp	.end			;         return false; /* eax (the return value) is already 0 */
					;     }
.c2_next1:
	;; ebx		parameter
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
	jae	near .c2_next2
	test	ecx, ecx		;       if(cbits) {
	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	mov	eax, 32
	mov	edx, [ebp]
	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
	cmp	ebx, eax		;         if(parameter < n) {
	jae	.c2_next4
					;           uval <<= parameter;
					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
	shl	edx, cl			;           drop the already-consumed high bits
	xchg	ebx, ecx		;           [ebx <- cbits, ecx <- parameter] (shld needs the count in cl)
	shld	edi, edx, cl
	add	ebx, ecx		;           cbits += parameter;
	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
	jmp	.break2			;           goto break2;
					;         }
.c2_next4:
					;         uval <<= n;
					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
%if 1
	rol	edx, cl			;            @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
					;            @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
%else
	shl	edx, cl
%endif
	xchg	eax, ecx		;         [eax <- cbits, ecx <- n]
	shld	edi, edx, cl
	xchg	eax, ecx		;         [eax <- n, ecx <- cbits]
%if 1
	ror	edx, cl			;            restored.
%else
	mov	edx, [ebp]
	mov	edx, [edx + 4*esi]
%endif
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	push	ebx			;		[need more registers]
	push	eax			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c2b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c2b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c2b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c2b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the 16-bit CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	eax			;		eax <- n
	pop	ebx			;		ebx <- parameter
	pop	edi			;		edi <- uval
	add	esi, byte 1		;         cwords++;
	mov	ecx, ebx
	sub	ecx, eax		;         cbits = parameter - n;
	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
					;           uval <<= cbits;
					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
					;         }
	jmp	.break2			;         goto break2;

	;; this section relocated out of the way for performance
.c2b4:					; crc16_align != 0: skip the bytes of this word that must not be CRC'd
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c2b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c2b2
	jmp	.c2b3

.c2_next3:				;       } else {
	mov	ecx, ebx		;         cbits = parameter;
					;         uval <<= cbits;
					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
	jmp	.break2			;         goto break2;
					;       }
.c2_next2:				;     } else {
	; in this case we're starting our read at a partial tail word;
	; the reader has guaranteed that we have at least 'parameter'
	; bits available to read, which makes this case simpler.
	; uval <<= parameter;
	; if(cbits) {
	;   /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	;   uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
	;   cbits += parameter;
	;   goto break2;
	; } else {
	;   cbits = parameter;
	;   uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
	;   goto break2;
	; }
	; the above is much shorter in assembly:
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
	add	ecx, ebx		;       cbits += parameter
	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
	shld	edi, eax, cl		;       uval <<= parameter <<< 'parameter' bits of tail word
	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
					;     }
					;   }
.break2:
	sub	[esp], ebx		;   ucbits -= parameter;

	;
	; compose the value
	;
	mov	ebx, [esp + 28]		;   ebx <- vals
	mov	edx, edi		;   edx <- uval
	and	edi, 1			;   edi <- uval & 1
	shr	edx, 1			;   edx <- uval >> 1
	neg	edi			;   edi <- -(int)(uval & 1)
	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
	mov	[ebx], edx		;   *vals <- edx
	sub	dword [esp + 32], byte 1	;   --nvals;
	jz	.finished		;   if(nvals == 0) /* jump to finish */
	xor	edi, edi		;   uval = 0;
	add	dword [esp + 28], 4	;   ++vals
	jmp	.val_loop		; }

.finished:
	mov	[ebp + 16], esi		; br->consumed_words = cwords;
	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
	mov	eax, 1			; return true
.end:
	add	esp, 4			; release the ucbits local
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
|
|
|
; 'end' is not a NASM built-in; presumably a macro from ia32/nasm.h that closes the file
; (NOTE(review): confirm its definition).
end
|
|
|
|
; Only when OBJ_FORMAT_elf is defined: emit an empty, non-allocated .note.GNU-stack section.
; NOTE(review): this is the conventional marker that the object does not need an
; executable stack -- confirm against the toolchain in use.
%ifdef OBJ_FORMAT_elf
|
|
section .note.GNU-stack noalloc
|
|
%endif
|