mirror of
https://github.com/ZDoom/Raze.git
synced 2024-12-11 05:11:07 +00:00
aa54d3af10
LZMA update plus several ZScript improvements.
506 lines
16 KiB
C
506 lines
16 KiB
C
/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
|
|
2023-04-02 : Igor Pavlov : Public domain */
|
|
|
|
#include "Precomp.h"
|
|
|
|
/* #define SHOW_STAT */

/*
  PRF2(s) : optional debug trace.
  If SHOW_STAT is defined, it prints the label (s) together with the low
  32 bits of p->ip64, the value of p->tempPos and the number of bytes
  between p->src and p->srcLim. It requires a variable named (p) in scope
  at the place of use.
  If SHOW_STAT is not defined, it expands to nothing.
  The expansion already ends with ';', so call sites do not add one.
*/
#ifdef SHOW_STAT
#include <stdio.h>
/* all values are cast to (unsigned), so they are printed with unsigned conversions (%x / %u) */
#define PRF2(s) printf("%s ip=%8x tempPos=%u src= %8x\n", s, (unsigned)p->ip64, (unsigned)p->tempPos, (unsigned)(p->srcLim - p->src));
#else
#define PRF2(s)
#endif
|
|
|
|
#include "Bcj2.h"
|
|
#include "CpuArch.h"
|
|
|
|
/* Range coder parameters used in this file: */
/* the encoder shifts out one byte when (range) becomes smaller than kTopValue */
#define kTopValue ((UInt32)1 << 24)
/* probabilities are stored as values in range [0, kBitModelTotal) */
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
/* adaptation speed: each update moves a probability by 1/(2^kNumMoveBits) of the remaining distance */
#define kNumMoveBits 5
|
|
|
|
/*
  Bcj2Enc_Init() resets all encoder fields that are handled in this file
  to the start-of-stream values:
    - every probability model is set to kBitModelTotal / 2,
    - range coder registers get their initial values,
    - positions, limits and the temp[] buffer counter are cleared.
  The stream pointers (bufs[], lims[], src, srcLim) are not touched here.
*/
void Bcj2Enc_Init(CBcj2Enc *p)
{
  const unsigned numProbs = (unsigned)(sizeof(p->probs) / sizeof(p->probs[0]));
  unsigned k;

  /* adaptive bit models */
  for (k = 0; k != numProbs; k++)
    p->probs[k] = kBitModelTotal >> 1;

  /* range coder registers */
  p->low = 0;
  p->range = 0xffffffff;
  p->cache = 0;
  p->cacheSize = 1;
  p->flushRem = 5;
  p->isFlushState = 0;

  /* state of conversion loop */
  p->state = BCJ2_ENC_STATE_ORIG;
  p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
  p->context = 0;
  p->tempPos = 0;

  /* positions and conversion limits */
  p->ip64 = 0;
  p->fileIp64 = 0;
  p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
  p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
  // p->relatExcludeBits = 0;
}
|
|
|
|
// Z7_NO_INLINE
|
|
Z7_FORCE_INLINE
|
|
static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
|
|
{
|
|
const UInt32 low = (UInt32)p->low;
|
|
const unsigned high = (unsigned)
|
|
#if defined(Z7_MSC_VER_ORIGINAL) \
|
|
&& defined(MY_CPU_X86) \
|
|
&& defined(MY_CPU_LE) \
|
|
&& !defined(MY_CPU_64BIT)
|
|
// we try to rid of __aullshr() call in MSVS-x86
|
|
(((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
|
|
#else
|
|
(p->low >> 32);
|
|
#endif
|
|
if (low < (UInt32)0xff000000 || high != 0)
|
|
{
|
|
Byte *buf = p->bufs[BCJ2_STREAM_RC];
|
|
do
|
|
{
|
|
if (buf == p->lims[BCJ2_STREAM_RC])
|
|
{
|
|
p->state = BCJ2_STREAM_RC;
|
|
p->bufs[BCJ2_STREAM_RC] = buf;
|
|
return True;
|
|
}
|
|
*buf++ = (Byte)(p->cache + high);
|
|
p->cache = 0xff;
|
|
}
|
|
while (--p->cacheSize);
|
|
p->bufs[BCJ2_STREAM_RC] = buf;
|
|
p->cache = (Byte)(low >> 24);
|
|
}
|
|
p->cacheSize++;
|
|
p->low = low << 8;
|
|
return False;
|
|
}
|
|
|
|
|
|
/*
|
|
We can use 2 alternative versions of code:
|
|
1) non-marker version:
|
|
Byte CBcj2Enc::context
|
|
Byte temp[8];
|
|
Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
|
|
Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
|
|
with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
|
|
|
|
2) marker version:
|
|
UInt32 CBcj2Enc::context
|
|
Byte CBcj2Enc::temp[4];
|
|
MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains found marker.
|
|
it's allowed that
|
|
one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
|
|
and another call of Bcj2Enc_Encode_2() does offset conversion.
|
|
So different values of (fileIp) and (fileSize) are possible
|
|
in these different Bcj2Enc_Encode_2() calls.
|
|
|
|
Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
|
|
So we use non-marker version.
|
|
*/
|
|
|
|
/*
|
|
Corner cases with overlap in multi-block.
|
|
before v23: there was one corner case, where converted instruction
|
|
could start in one sub-stream and finish in next sub-stream.
|
|
If multi-block (solid) encoding is used,
|
|
and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
|
|
and (0f) is last byte of previous sub-stream
|
|
and (8x) is first byte of current sub-stream
|
|
then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
|
|
BCJ2 encoder can convert 32-bit offset for that (0f 8x) pair,
|
|
if that offset meets limit requirements.
|
|
If encoder allows 32-bit offset conversion for such overlap case,
|
|
then the data in 3 uncompressed BCJ2 streams for some sub-stream
|
|
can depend on data of previous sub-stream.
|
|
That corner case is not a big problem, and it's a rare case.
|
|
Since v23.00 we do additional check to prevent conversions in such overlap cases.
|
|
*/
|
|
|
|
/*
|
|
Bcj2Enc_Encode_2() output variables at exit:
|
|
{
|
|
if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
|
|
{
|
|
it means that encoder needs more input data.
|
|
if (p->srcLim == p->src) at exit, then
|
|
{
|
|
(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
|
|
all input data were read and processed, and we are ready for
|
|
new input data.
|
|
}
|
|
else
|
|
{
|
|
(p->srcLim != p->src)
|
|
(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
|
|
The encoder has found e8/e9/0f_8x marker,
|
|
and p->src points to last byte of that marker,
|
|
Bcj2Enc_Encode_2() needs more input data to get totally
|
|
5 bytes (last byte of marker and 32-bit branch offset)
|
|
as continuous array starting from p->src.
|
|
(p->srcLim - p->src < 5) requirement is met after exit.
|
|
So non-processed residue from p->src to p->srcLim is always less than 5 bytes.
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
|
|
Z7_NO_INLINE
|
|
static void Bcj2Enc_Encode_2(CBcj2Enc *p)
|
|
{
|
|
if (!p->isFlushState)
|
|
{
|
|
const Byte *src;
|
|
UInt32 v;
|
|
{
|
|
const unsigned state = p->state;
|
|
if (BCJ2_IS_32BIT_STREAM(state))
|
|
{
|
|
Byte *cur = p->bufs[state];
|
|
if (cur == p->lims[state])
|
|
return;
|
|
SetBe32a(cur, p->tempTarget)
|
|
p->bufs[state] = cur + 4;
|
|
}
|
|
}
|
|
p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
|
|
src = p->src;
|
|
v = p->context;
|
|
|
|
// #define WRITE_CONTEXT p->context = v; // for marker version
|
|
#define WRITE_CONTEXT p->context = (Byte)v;
|
|
#define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT
|
|
|
|
for (;;)
|
|
{
|
|
// const Byte *src;
|
|
// UInt32 v;
|
|
CBcj2Enc_ip_unsigned ip;
|
|
if (p->range < kTopValue)
|
|
{
|
|
// to reduce register pressure and code size: we save and restore local variables.
|
|
WRITE_CONTEXT_AND_SRC
|
|
if (Bcj2_RangeEnc_ShiftLow(p))
|
|
return;
|
|
p->range <<= 8;
|
|
src = p->src;
|
|
v = p->context;
|
|
}
|
|
// src = p->src;
|
|
// #define MARKER_FLAG ((UInt32)1 << 17)
|
|
// if ((v & MARKER_FLAG) == 0) // for marker version
|
|
{
|
|
const Byte *srcLim;
|
|
Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
|
|
{
|
|
const SizeT remSrc = (SizeT)(p->srcLim - src);
|
|
SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
|
|
if (rem >= remSrc)
|
|
rem = remSrc;
|
|
srcLim = src + rem;
|
|
}
|
|
/* p->context contains context of previous byte:
|
|
bits [0 : 7] : src[-1], if (src) was changed in this call
|
|
bits [8 : 31] : are undefined for non-marker version
|
|
*/
|
|
// v = p->context;
|
|
#define NUM_SHIFT_BITS 24
|
|
#define CONV_FLAG ((UInt32)1 << 16)
|
|
#define ONE_ITER { \
|
|
b = src[0]; \
|
|
*dest++ = (Byte)b; \
|
|
v = (v << NUM_SHIFT_BITS) | b; \
|
|
if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
|
|
if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
|
|
((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
|
|
src++; if (src == srcLim) { break; } }
|
|
|
|
if (src != srcLim)
|
|
for (;;)
|
|
{
|
|
/* clang can generate ineffective code with setne instead of two jcc instructions.
|
|
we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */
|
|
unsigned b;
|
|
ONE_ITER
|
|
ONE_ITER
|
|
}
|
|
|
|
ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
|
|
p->bufs[BCJ2_STREAM_MAIN] = dest;
|
|
p->ip64 = ip;
|
|
|
|
if (src == srcLim)
|
|
{
|
|
WRITE_CONTEXT_AND_SRC
|
|
if (src != p->srcLim)
|
|
{
|
|
p->state = BCJ2_STREAM_MAIN;
|
|
return;
|
|
}
|
|
/* (p->src == p->srcLim)
|
|
(p->state == BCJ2_ENC_STATE_ORIG) */
|
|
if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
|
|
return;
|
|
/* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
|
|
// (p->flushRem == 5);
|
|
p->isFlushState = 1;
|
|
break;
|
|
}
|
|
src++;
|
|
// p->src = src;
|
|
}
|
|
// ip = p->ip; // for marker version
|
|
/* marker was found */
|
|
/* (v) contains marker that was found:
|
|
bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
|
|
: value of src[-2] : xx/xx/0f
|
|
bits [0 : 7] : value of src[-1] : e8/e9/8x
|
|
*/
|
|
{
|
|
{
|
|
#if NUM_SHIFT_BITS != 24
|
|
v &= ~(UInt32)CONV_FLAG;
|
|
#endif
|
|
// UInt32 relat = 0;
|
|
if ((SizeT)(p->srcLim - src) >= 4)
|
|
{
|
|
/*
|
|
if (relat != 0 || (Byte)v != 0xe8)
|
|
BoolInt isBigOffset = True;
|
|
*/
|
|
const UInt32 relat = GetUi32(src);
|
|
/*
|
|
#define EXCLUDE_FLAG ((UInt32)1 << 4)
|
|
#define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
|
|
if (p->relatExcludeBits != 0)
|
|
{
|
|
const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
|
|
isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
|
|
}
|
|
// isBigOffset = False; // for debug
|
|
*/
|
|
ip -= p->fileIp64;
|
|
// Use the following if check, if (ip) is 64-bit:
|
|
if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)
|
|
if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
|
|
if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
|
|
v |= CONV_FLAG;
|
|
}
|
|
else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
|
|
{
|
|
// (p->srcLim - src < 4)
|
|
// /*
|
|
// for non-marker version
|
|
p->ip64--; // p->ip = ip - 1;
|
|
p->bufs[BCJ2_STREAM_MAIN]--;
|
|
src--;
|
|
v >>= NUM_SHIFT_BITS;
|
|
// (0 < p->srcLim - p->src <= 4)
|
|
// */
|
|
// v |= MARKER_FLAG; // for marker version
|
|
/* (p->state == BCJ2_ENC_STATE_ORIG) */
|
|
WRITE_CONTEXT_AND_SRC
|
|
return;
|
|
}
|
|
{
|
|
const unsigned c = ((v + 0x17) >> 6) & 1;
|
|
CBcj2Prob *prob = p->probs + (unsigned)
|
|
(((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
|
|
/*
|
|
((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
|
|
((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));
|
|
*/
|
|
const unsigned ttt = *prob;
|
|
const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
|
|
if ((v & CONV_FLAG) == 0)
|
|
{
|
|
// static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
|
|
// v = (Byte)v; // for marker version
|
|
p->range = bound;
|
|
*prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
|
|
// WRITE_CONTEXT_AND_SRC
|
|
continue;
|
|
}
|
|
p->low += bound;
|
|
p->range -= bound;
|
|
*prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
|
|
}
|
|
// p->context = src[3];
|
|
{
|
|
// const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
|
|
const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
|
|
ip = p->ip64;
|
|
v = GetUi32(src); // relat
|
|
ip += 4;
|
|
p->ip64 = ip;
|
|
src += 4;
|
|
// p->src = src;
|
|
{
|
|
const UInt32 absol = (UInt32)ip + v;
|
|
Byte *cur = p->bufs[cj];
|
|
v >>= 24;
|
|
// WRITE_CONTEXT
|
|
if (cur == p->lims[cj])
|
|
{
|
|
p->state = cj;
|
|
p->tempTarget = absol;
|
|
WRITE_CONTEXT_AND_SRC
|
|
return;
|
|
}
|
|
SetBe32a(cur, absol)
|
|
p->bufs[cj] = cur + 4;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} // end of loop
|
|
}
|
|
|
|
for (; p->flushRem != 0; p->flushRem--)
|
|
if (Bcj2_RangeEnc_ShiftLow(p))
|
|
return;
|
|
p->state = BCJ2_ENC_STATE_FINISHED;
|
|
}
|
|
|
|
|
|
/*
|
|
BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
|
|
So base function Bcj2Enc_Encode_2()
|
|
in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
|
|
(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
|
|
Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
|
|
so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
|
|
then (p->src == p->srcLim).
|
|
And the caller's code is simpler with Bcj2Enc_Encode().
|
|
*/
|
|
|
|
Z7_NO_INLINE
|
|
void Bcj2Enc_Encode(CBcj2Enc *p)
|
|
{
|
|
PRF2("\n----")
|
|
if (p->tempPos != 0)
|
|
{
|
|
/* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
|
|
unsigned extra = 0;
|
|
/* We will touch only minimal required number of bytes in input (src) stream.
|
|
So we will add input bytes from (src) stream to temp[] with step of 1 byte.
|
|
We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
|
|
in first loop iteration because
|
|
- previous call of Bcj2Enc_Encode() could use another (finishMode),
|
|
- previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
|
|
the case with full temp[] buffer (p->tempPos == 4) is possible here.
|
|
*/
|
|
for (;;)
|
|
{
|
|
// (0 < p->tempPos <= 5) // in non-marker version
|
|
/* p->src : the current src data position including extra bytes
|
|
that were copied to temp[] buffer in this call */
|
|
const Byte *src = p->src;
|
|
const Byte *srcLim = p->srcLim;
|
|
const EBcj2Enc_FinishMode finishMode = p->finishMode;
|
|
if (src != srcLim)
|
|
{
|
|
/* if there are some src data after the data copied to temp[],
|
|
then we use MODE_CONTINUE for temp data */
|
|
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
|
|
}
|
|
p->src = p->temp;
|
|
p->srcLim = p->temp + p->tempPos;
|
|
PRF2(" ")
|
|
Bcj2Enc_Encode_2(p);
|
|
{
|
|
const unsigned num = (unsigned)(p->src - p->temp);
|
|
const unsigned tempPos = p->tempPos - num;
|
|
unsigned i;
|
|
p->tempPos = tempPos;
|
|
for (i = 0; i < tempPos; i++)
|
|
p->temp[i] = p->temp[(SizeT)i + num];
|
|
// tempPos : number of bytes in temp buffer
|
|
p->src = src;
|
|
p->srcLim = srcLim;
|
|
p->finishMode = finishMode;
|
|
if (p->state != BCJ2_ENC_STATE_ORIG)
|
|
{
|
|
// (p->tempPos <= 4) // in non-marker version
|
|
/* if (the reason of exit from Bcj2Enc_Encode_2()
|
|
is not BCJ2_ENC_STATE_ORIG),
|
|
then we exit from Bcj2Enc_Encode() with same reason */
|
|
// optional code begin : we rollback (src) and tempPos, if it's possible:
|
|
if (extra >= tempPos)
|
|
extra = tempPos;
|
|
p->src = src - extra;
|
|
p->tempPos = tempPos - extra;
|
|
// optional code end : rollback of (src) and tempPos
|
|
return;
|
|
}
|
|
/* (p->tempPos <= 4)
|
|
(p->state == BCJ2_ENC_STATE_ORIG)
|
|
so encoder needs more data than in temp[] */
|
|
if (src == srcLim)
|
|
return; // src buffer has no more input data.
|
|
/* (src != srcLim)
|
|
so we can provide more input data from src for Bcj2Enc_Encode_2() */
|
|
if (extra >= tempPos)
|
|
{
|
|
/* (extra >= tempPos) means that temp buffer contains
|
|
only data from src buffer of this call.
|
|
So now we can encode without temp buffer */
|
|
p->src = src - tempPos; // rollback (src)
|
|
p->tempPos = 0;
|
|
break;
|
|
}
|
|
// we append one additional extra byte from (src) to temp[] buffer:
|
|
p->temp[tempPos] = *src;
|
|
p->tempPos = tempPos + 1;
|
|
// (0 < p->tempPos <= 5) // in non-marker version
|
|
p->src = src + 1;
|
|
extra++;
|
|
}
|
|
}
|
|
}
|
|
|
|
PRF2("++++")
|
|
// (p->tempPos == 0)
|
|
Bcj2Enc_Encode_2(p);
|
|
PRF2("====")
|
|
|
|
if (p->state == BCJ2_ENC_STATE_ORIG)
|
|
{
|
|
const Byte *src = p->src;
|
|
const Byte *srcLim = p->srcLim;
|
|
const unsigned rem = (unsigned)(srcLim - src);
|
|
/* (rem <= 4) here.
|
|
if (p->src != p->srcLim), then
|
|
- we copy non-processed bytes from (p->src) to temp[] buffer,
|
|
- we set p->src equal to p->srcLim.
|
|
*/
|
|
if (rem)
|
|
{
|
|
unsigned i = 0;
|
|
p->src = srcLim;
|
|
p->tempPos = rem;
|
|
// (0 < p->tempPos <= 4)
|
|
do
|
|
p->temp[i] = src[i];
|
|
while (++i != rem);
|
|
}
|
|
// (p->tempPos <= 4)
|
|
// (p->src == p->srcLim)
|
|
}
|
|
}
|
|
|
|
/* we remove the macros of this file, so they are not visible in the code after this point */
#undef PRF2
#undef CONV_FLAG
#undef MARKER_FLAG
#undef WRITE_CONTEXT
#undef WRITE_CONTEXT_AND_SRC
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits
|