mirror of
https://github.com/etlegacy/etlegacy-libs.git
synced 2025-04-10 00:11:19 +00:00
libs: updated to libjpeg-turbo 2.0.4
This commit is contained in:
parent
3196e33edf
commit
86b612bda1
128 changed files with 580 additions and 790 deletions
|
@ -15,13 +15,18 @@ Build Requirements
|
|||
* If using NASM, 2.10 or later is required.
|
||||
* If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
|
||||
build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
|
||||
when building macho64 objects.) NASM or YASM can be obtained from
|
||||
[MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
|
||||
when building macho64 objects.)
|
||||
* If using YASM, 1.2.0 or later is required.
|
||||
* If building on macOS, NASM or YASM can be obtained from
|
||||
[MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
|
||||
- NOTE: Currently, if it is desirable to hide the SIMD function symbols in
|
||||
Mac executables or shared libraries that statically link with
|
||||
libjpeg-turbo, then YASM must be used when building libjpeg-turbo.
|
||||
libjpeg-turbo, then NASM 2.14 or later or YASM must be used when
|
||||
building libjpeg-turbo.
|
||||
* If building on Windows, **nasm.exe**/**yasm.exe** should be in your `PATH`.
|
||||
* NASM and YASM are located in the CRB (Code Ready Builder) repository on
|
||||
Red Hat Enterprise Linux 8 and in the PowerTools repository on CentOS 8,
|
||||
which is not enabled by default.
|
||||
|
||||
The binary RPMs released by the NASM project do not work on older Linux
|
||||
systems, such as Red Hat Enterprise Linux 5. On such systems, you can easily
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
brew 'yasm'
|
||||
brew 'gcc@5'
|
||||
brew 'md5sha1sum'
|
||||
cask 'Caskroom/versions/java6'
|
|
@ -5,7 +5,7 @@ if(CMAKE_EXECUTABLE_SUFFIX)
|
|||
endif()
|
||||
|
||||
project(libjpeg-turbo C)
|
||||
set(VERSION 2.0.3)
|
||||
set(VERSION 2.0.4)
|
||||
string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
|
||||
list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
|
||||
list(GET VERSION_TRIPLET 1 VERSION_MINOR)
|
||||
|
|
|
@ -1,3 +1,44 @@
|
|||
2.0.4
|
||||
=====
|
||||
|
||||
### Significant changes relative to 2.0.3:
|
||||
|
||||
1. Fixed a regression in the Windows packaging system (introduced by
|
||||
2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
|
||||
64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
|
||||
one of them could be uninstalled.
|
||||
|
||||
2. Fixed a signed integer overflow and subsequent segfault that occurred when
|
||||
attempting to decompress images with more than 715827882 pixels using the
|
||||
64-bit C version of TJBench.
|
||||
|
||||
3. Fixed out-of-bounds write in `tjDecompressToYUV2()` and
|
||||
`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
|
||||
occurred when attempting to decompress grayscale JPEG images that were
|
||||
compressed with a sampling factor other than 1 (for instance, with
|
||||
`cjpeg -grayscale -sample 2x2`).
|
||||
|
||||
4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
|
||||
incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
|
||||
JPEG images. This was known to cause a buffer overflow when attempting to
|
||||
decompress some such images using `tjDecompressToYUV2()` or
|
||||
`tjDecompressToYUVPlanes()`.
|
||||
|
||||
5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
|
||||
a specially-crafted malformed JPEG image containing an extremely-high-frequency
|
||||
coefficient block (junk image data that could never be generated by a
|
||||
legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
|
||||
be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that the buffer
|
||||
overrun was fully contained within the stack and did not cause a segfault or
|
||||
other user-visible errant behavior, and given that the lossless transformer
|
||||
(unlike the decompressor) is not generally exposed to arbitrary data exploits,
|
||||
this issue did not likely pose a security risk.
|
||||
|
||||
6. The ARM 64-bit (ARMv8) NEON SIMD assembly code now stores constants in a
|
||||
separate read-only data section rather than in the text section, to support
|
||||
execute-only memory layouts.
|
||||
|
||||
|
||||
2.0.3
|
||||
=====
|
||||
|
||||
|
@ -138,10 +179,11 @@ would produce a "Bogus message code" error message if the underlying bitmap and
|
|||
PPM readers/writers threw an error that was specific to the readers/writers
|
||||
(as opposed to a general libjpeg API error.)
|
||||
|
||||
4. Fixed an issue whereby a specially-crafted malformed BMP file, one in which
|
||||
the header specified an image width of 1073741824 pixels, would trigger a
|
||||
floating point exception (division by zero) in the `tjLoadImage()` function
|
||||
when attempting to load the BMP file into a 4-component image buffer.
|
||||
4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
|
||||
file, one in which the header specified an image width of 1073741824 pixels,
|
||||
would trigger a floating point exception (division by zero) in the
|
||||
`tjLoadImage()` function when attempting to load the BMP file into a
|
||||
4-component image buffer.
|
||||
|
||||
5. Fixed an issue whereby certain combinations of calls to
|
||||
`jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
|
||||
|
@ -155,10 +197,10 @@ a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
|
|||
7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
|
||||
extensions if it detects that the compiler does not support DSPr2 instructions.
|
||||
|
||||
8. Fixed out-of-bounds read in cjpeg that occurred when attempting to compress
|
||||
a specially-crafted malformed color-index (8-bit-per-sample) BMP file in which
|
||||
some of the samples (color indices) exceeded the bounds of the BMP file's color
|
||||
table.
|
||||
8. Fixed out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
|
||||
attempting to compress a specially-crafted malformed color-index
|
||||
(8-bit-per-sample) BMP file in which some of the samples (color indices)
|
||||
exceeded the bounds of the BMP file's color table.
|
||||
|
||||
9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
|
||||
by the Clang and GCC undefined behavior sanitizers, that could be triggered by
|
||||
|
@ -318,8 +360,8 @@ write scanlines in bottom-up order.) djpeg will now exit gracefully if an
|
|||
output format other than PPM/PGM, GIF, or Targa is selected along with the
|
||||
`-crop` option.
|
||||
|
||||
4. Fixed an issue whereby `jpeg_skip_scanlines()` would segfault if color
|
||||
quantization was enabled.
|
||||
4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
|
||||
segfault if color quantization was enabled.
|
||||
|
||||
5. TJBench (both C and Java versions) will now display usage information if any
|
||||
command-line argument is unrecognized. This prevents the program from silently
|
||||
|
@ -946,13 +988,13 @@ and IDCT algorithms (both are used during JPEG decompression.) For unknown
|
|||
reasons (probably related to clang), this code cannot currently be compiled for
|
||||
iOS.
|
||||
|
||||
15. Fixed an extremely rare bug that could cause the Huffman encoder's local
|
||||
buffer to overrun when a very high-frequency MCU is compressed using quality
|
||||
100 and no subsampling, and when the JPEG output buffer is being dynamically
|
||||
resized by the destination manager. This issue was so rare that, even with a
|
||||
test program specifically designed to make the bug occur (by injecting random
|
||||
high-frequency YUV data into the compressor), it was reproducible only once in
|
||||
about every 25 million iterations.
|
||||
15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
|
||||
encoder's local buffer to overrun when a very high-frequency MCU is compressed
|
||||
using quality 100 and no subsampling, and when the JPEG output buffer is being
|
||||
dynamically resized by the destination manager. This issue was so rare that,
|
||||
even with a test program specifically designed to make the bug occur (by
|
||||
injecting random high-frequency YUV data into the compressor), it was
|
||||
reproducible only once in about every 25 million iterations.
|
||||
|
||||
16. Fixed an oversight in the TurboJPEG C wrapper: if any of the JPEG
|
||||
compression functions was called repeatedly with the same
|
||||
|
@ -987,8 +1029,9 @@ entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
|
|||
jpegtran, for instance) would result in an error, `Requested feature was
|
||||
omitted at compile time`.
|
||||
|
||||
4. Fixed a couple of issues whereby malformed JPEG images would cause
|
||||
libjpeg-turbo to use uninitialized memory during decompression.
|
||||
4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
|
||||
JPEG images would cause libjpeg-turbo to use uninitialized memory during
|
||||
decompression.
|
||||
|
||||
5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
|
||||
when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
|
||||
|
@ -1127,9 +1170,9 @@ correct behavior of the colorspace extensions when merged upsampling is used.
|
|||
upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
|
||||
calling conventions.
|
||||
|
||||
4. Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
|
||||
images (specifically, images in which the component count was erroneously set
|
||||
to a large value) would cause libjpeg-turbo to segfault.
|
||||
4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
|
||||
corrupt JPEG images (specifically, images in which the component count was
|
||||
erroneously set to a large value) would cause libjpeg-turbo to segfault.
|
||||
|
||||
5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
|
||||
processors. The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
|
||||
|
|
34
jpegturbo/README.md
Executable file → Normal file
34
jpegturbo/README.md
Executable file → Normal file
|
@ -1,14 +1,14 @@
|
|||
Background
|
||||
==========
|
||||
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
|
||||
virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86 and x86-64
|
||||
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
|
||||
all else being equal. On other types of systems, libjpeg-turbo can still
|
||||
outperform libjpeg by a significant amount, by virtue of its highly-optimized
|
||||
Huffman coding routines. In many cases, the performance of libjpeg-turbo
|
||||
rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
|
@ -145,14 +145,14 @@ supported and which aren't.
|
|||
|
||||
#### Fully supported
|
||||
|
||||
- **libjpeg: IDCT scaling extensions in decompressor**<br>
|
||||
- **libjpeg API: IDCT scaling extensions in decompressor**<br>
|
||||
libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
|
||||
1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
|
||||
and 1/2 are SIMD-accelerated.)
|
||||
|
||||
- **libjpeg: Arithmetic coding**
|
||||
- **libjpeg API: Arithmetic coding**
|
||||
|
||||
- **libjpeg: In-memory source and destination managers**<br>
|
||||
- **libjpeg API: In-memory source and destination managers**<br>
|
||||
See notes below.
|
||||
|
||||
- **cjpeg: Separate quality settings for luminance and chrominance**<br>
|
||||
|
@ -184,14 +184,14 @@ means of quality improvement. The reader is invited to peruse the research at
|
|||
but it is the general belief of our project that these features have not
|
||||
demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
|
||||
|
||||
- **libjpeg: DCT scaling in compressor**<br>
|
||||
- **libjpeg API: DCT scaling in compressor**<br>
|
||||
`cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
|
||||
There is no technical reason why DCT scaling could not be supported when
|
||||
emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
|
||||
below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
|
||||
8/9 would be available, which is of limited usefulness.
|
||||
|
||||
- **libjpeg: SmartScale**<br>
|
||||
- **libjpeg API: SmartScale**<br>
|
||||
`cinfo.block_size` is silently ignored.
|
||||
SmartScale is an extension to the JPEG format that allows for DCT block
|
||||
sizes other than 8x8. Providing support for this new format would be
|
||||
|
@ -204,7 +204,7 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
|
|||
interest in providing this feature would be as a means of supporting
|
||||
additional DCT scaling factors.
|
||||
|
||||
- **libjpeg: Fancy downsampling in compressor**<br>
|
||||
- **libjpeg API: Fancy downsampling in compressor**<br>
|
||||
`cinfo.do_fancy_downsampling` is silently ignored.
|
||||
This requires the DCT scaling feature, which is not supported.
|
||||
|
||||
|
@ -252,8 +252,8 @@ building libjpeg-turbo. This will restore the pre-1.3 behavior, in which
|
|||
libjpeg v8 API/ABI.
|
||||
|
||||
On Un*x systems, including the in-memory source/destination managers changes
|
||||
the dynamic library version from 62.1.0 to 62.2.0 if using libjpeg v6b API/ABI
|
||||
emulation and from 7.1.0 to 7.2.0 if using libjpeg v7 API/ABI emulation.
|
||||
the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
|
||||
emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
|
||||
|
||||
Note that, on most Un*x systems, the dynamic linker will not look for a
|
||||
function in a library until that function is actually used. Thus, if a program
|
||||
|
@ -329,7 +329,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
|
|||
necessary to use the slow Huffman decoder when decompressing a JPEG image that
|
||||
has restart markers. This can cause the decompression performance to drop by
|
||||
as much as 20%, but the performance will still be much greater than that of
|
||||
libjpeg. Many consumer packages, such as PhotoShop, use restart markers when
|
||||
libjpeg. Many consumer packages, such as Photoshop, use restart markers when
|
||||
generating JPEG images, so images generated by those programs will experience
|
||||
this issue.
|
||||
|
||||
|
|
|
@ -83,7 +83,7 @@ endif()
|
|||
if(BITS EQUAL 64)
|
||||
set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
|
||||
set(INST_NAME ${INST_NAME}64)
|
||||
set(INST_REG_NAME ${INST_DIR}64)
|
||||
set(INST_REG_NAME ${INST_REG_NAME}64)
|
||||
set(INST_DEFS ${INST_DEFS} -DWIN64)
|
||||
endif()
|
||||
|
||||
|
|
|
@ -516,7 +516,9 @@ main(int argc, char **argv)
|
|||
FILE *input_file;
|
||||
FILE *output_file;
|
||||
unsigned char *inbuffer = NULL;
|
||||
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
|
||||
unsigned long insize = 0;
|
||||
#endif
|
||||
JDIMENSION num_scanlines;
|
||||
|
||||
/* On Mac, fetch a command line. */
|
||||
|
|
|
@ -288,12 +288,14 @@ my_error_exit(j_common_ptr cinfo)
|
|||
}
|
||||
|
||||
|
||||
METHODDEF(int) do_read_JPEG_file(struct jpeg_decompress_struct *cinfo,
|
||||
char *filename);
|
||||
|
||||
/*
|
||||
* Sample routine for JPEG decompression. We assume that the source file name
|
||||
* is passed in. We want to return 1 on success, 0 on error.
|
||||
*/
|
||||
|
||||
|
||||
GLOBAL(int)
|
||||
read_JPEG_file(char *filename)
|
||||
{
|
||||
|
@ -301,6 +303,21 @@ read_JPEG_file(char *filename)
|
|||
* working space (which is allocated as needed by the JPEG library).
|
||||
*/
|
||||
struct jpeg_decompress_struct cinfo;
|
||||
|
||||
return do_read_JPEG_file(&cinfo, filename);
|
||||
}
|
||||
|
||||
/*
|
||||
* We call the libjpeg API from within a separate function, because modifying
|
||||
* the local non-volatile jpeg_decompress_struct instance below the setjmp()
|
||||
* return point and then accessing the instance after setjmp() returns would
|
||||
* result in undefined behavior that may potentially overwrite all or part of
|
||||
* the structure.
|
||||
*/
|
||||
|
||||
METHODDEF(int)
|
||||
do_read_JPEG_file(struct jpeg_decompress_struct *cinfo, char *filename)
|
||||
{
|
||||
/* We use our private extension JPEG error handler.
|
||||
* Note that this struct must live as long as the main JPEG parameter
|
||||
* struct, to avoid dangling-pointer problems.
|
||||
|
@ -325,27 +342,27 @@ read_JPEG_file(char *filename)
|
|||
/* Step 1: allocate and initialize JPEG decompression object */
|
||||
|
||||
/* We set up the normal JPEG error routines, then override error_exit. */
|
||||
cinfo.err = jpeg_std_error(&jerr.pub);
|
||||
cinfo->err = jpeg_std_error(&jerr.pub);
|
||||
jerr.pub.error_exit = my_error_exit;
|
||||
/* Establish the setjmp return context for my_error_exit to use. */
|
||||
if (setjmp(jerr.setjmp_buffer)) {
|
||||
/* If we get here, the JPEG code has signaled an error.
|
||||
* We need to clean up the JPEG object, close the input file, and return.
|
||||
*/
|
||||
jpeg_destroy_decompress(&cinfo);
|
||||
jpeg_destroy_decompress(cinfo);
|
||||
fclose(infile);
|
||||
return 0;
|
||||
}
|
||||
/* Now we can initialize the JPEG decompression object. */
|
||||
jpeg_create_decompress(&cinfo);
|
||||
jpeg_create_decompress(cinfo);
|
||||
|
||||
/* Step 2: specify data source (eg, a file) */
|
||||
|
||||
jpeg_stdio_src(&cinfo, infile);
|
||||
jpeg_stdio_src(cinfo, infile);
|
||||
|
||||
/* Step 3: read file parameters with jpeg_read_header() */
|
||||
|
||||
(void)jpeg_read_header(&cinfo, TRUE);
|
||||
(void)jpeg_read_header(cinfo, TRUE);
|
||||
/* We can ignore the return value from jpeg_read_header since
|
||||
* (a) suspension is not possible with the stdio data source, and
|
||||
* (b) we passed TRUE to reject a tables-only JPEG file as an error.
|
||||
|
@ -360,7 +377,7 @@ read_JPEG_file(char *filename)
|
|||
|
||||
/* Step 5: Start decompressor */
|
||||
|
||||
(void)jpeg_start_decompress(&cinfo);
|
||||
(void)jpeg_start_decompress(cinfo);
|
||||
/* We can ignore the return value since suspension is not possible
|
||||
* with the stdio data source.
|
||||
*/
|
||||
|
@ -372,30 +389,30 @@ read_JPEG_file(char *filename)
|
|||
* In this example, we need to make an output work buffer of the right size.
|
||||
*/
|
||||
/* JSAMPLEs per row in output buffer */
|
||||
row_stride = cinfo.output_width * cinfo.output_components;
|
||||
row_stride = cinfo->output_width * cinfo->output_components;
|
||||
/* Make a one-row-high sample array that will go away when done with image */
|
||||
buffer = (*cinfo.mem->alloc_sarray)
|
||||
((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1);
|
||||
buffer = (*cinfo->mem->alloc_sarray)
|
||||
((j_common_ptr)cinfo, JPOOL_IMAGE, row_stride, 1);
|
||||
|
||||
/* Step 6: while (scan lines remain to be read) */
|
||||
/* jpeg_read_scanlines(...); */
|
||||
|
||||
/* Here we use the library's state variable cinfo.output_scanline as the
|
||||
/* Here we use the library's state variable cinfo->output_scanline as the
|
||||
* loop counter, so that we don't have to keep track ourselves.
|
||||
*/
|
||||
while (cinfo.output_scanline < cinfo.output_height) {
|
||||
while (cinfo->output_scanline < cinfo->output_height) {
|
||||
/* jpeg_read_scanlines expects an array of pointers to scanlines.
|
||||
* Here the array is only one element long, but you could ask for
|
||||
* more than one scanline at a time if that's more convenient.
|
||||
*/
|
||||
(void)jpeg_read_scanlines(&cinfo, buffer, 1);
|
||||
(void)jpeg_read_scanlines(cinfo, buffer, 1);
|
||||
/* Assume put_scanline_someplace wants a pointer and sample count. */
|
||||
put_scanline_someplace(buffer[0], row_stride);
|
||||
}
|
||||
|
||||
/* Step 7: Finish decompression */
|
||||
|
||||
(void)jpeg_finish_decompress(&cinfo);
|
||||
(void)jpeg_finish_decompress(cinfo);
|
||||
/* We can ignore the return value since suspension is not possible
|
||||
* with the stdio data source.
|
||||
*/
|
||||
|
@ -403,7 +420,7 @@ read_JPEG_file(char *filename)
|
|||
/* Step 8: Release JPEG decompression object */
|
||||
|
||||
/* This is an important step since it will release a good deal of memory. */
|
||||
jpeg_destroy_decompress(&cinfo);
|
||||
jpeg_destroy_decompress(cinfo);
|
||||
|
||||
/* After finish_decompress, we can close the input file.
|
||||
* Here we postpone it until after no more JPEG errors are possible,
|
||||
|
|
|
@ -478,6 +478,8 @@ final class TJBench {
|
|||
if (!compOnly)
|
||||
decomp(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
|
||||
fileName, tilew, tileh);
|
||||
else if (quiet == 1)
|
||||
System.out.println("N/A");
|
||||
|
||||
if (tilew == w && tileh == h) break;
|
||||
}
|
||||
|
|
|
@ -43,8 +43,8 @@
|
|||
*/
|
||||
|
||||
/* NOTE: Both GCC and Clang define __GNUC__ */
|
||||
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||
#if !defined __thumb__ || defined __thumb2__
|
||||
#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
|
||||
#if !defined(__thumb__) || defined(__thumb2__)
|
||||
#define USE_CLZ_INTRINSIC
|
||||
#endif
|
||||
#endif
|
||||
|
@ -432,7 +432,7 @@ dump_buffer(working_state *state)
|
|||
* scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
|
||||
* larger than 200 bytes.
|
||||
*/
|
||||
#define BUFSIZE (DCTSIZE2 * 4)
|
||||
#define BUFSIZE (DCTSIZE2 * 8)
|
||||
|
||||
#define LOAD_BUFFER() { \
|
||||
if (state->free_in_buffer < BUFSIZE) { \
|
||||
|
|
|
@ -1,73 +0,0 @@
|
|||
/* Version ID for the JPEG library.
|
||||
* Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
|
||||
*/
|
||||
#define JPEG_LIB_VERSION 80
|
||||
|
||||
/* libjpeg-turbo version */
|
||||
#define LIBJPEG_TURBO_VERSION 2.0.3
|
||||
|
||||
/* libjpeg-turbo version in integer form */
|
||||
#define LIBJPEG_TURBO_VERSION_NUMBER 203
|
||||
|
||||
/* Support arithmetic encoding */
|
||||
#define C_ARITH_CODING_SUPPORTED 1
|
||||
|
||||
/* Support arithmetic decoding */
|
||||
#define D_ARITH_CODING_SUPPORTED 1
|
||||
|
||||
/* Support in-memory source/destination managers */
|
||||
#define MEM_SRCDST_SUPPORTED 1
|
||||
|
||||
/* Use accelerated SIMD routines. */
|
||||
#cmakedefine WITH_SIMD 1
|
||||
|
||||
/*
|
||||
* Define BITS_IN_JSAMPLE as either
|
||||
* 8 for 8-bit sample values (the usual setting)
|
||||
* 12 for 12-bit sample values
|
||||
* Only 8 and 12 are legal data precisions for lossy JPEG according to the
|
||||
* JPEG standard, and the IJG code does not support anything else!
|
||||
* We do not support run-time selection of data precision, sorry.
|
||||
*/
|
||||
|
||||
#define BITS_IN_JSAMPLE 8 /* use 8 or 12 */
|
||||
|
||||
/* Define to 1 if you have the <locale.h> header file. */
|
||||
#cmakedefine HAVE_LOCALE_H 1
|
||||
|
||||
/* Define to 1 if you have the <stddef.h> header file. */
|
||||
#cmakedefine HAVE_STDDEF_H 1
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#cmakedefine HAVE_STDLIB_H 1
|
||||
|
||||
/* Define if you need to include <sys/types.h> to get size_t. */
|
||||
#cmakedefine NEED_SYS_TYPES_H 1
|
||||
|
||||
/* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
|
||||
memset/memcpy in <string.h>. */
|
||||
#cmakedefine NEED_BSD_STRINGS 1
|
||||
|
||||
/* Define to 1 if the system has the type `unsigned char'. */
|
||||
#define HAVE_UNSIGNED_CHAR 1
|
||||
|
||||
/* Define to 1 if the system has the type `unsigned short'. */
|
||||
#define HAVE_UNSIGNED_SHORT 1
|
||||
|
||||
/* Compiler does not support pointers to undefined structures. */
|
||||
#cmakedefine INCOMPLETE_TYPES_BROKEN 1
|
||||
|
||||
/* Define if your (broken) compiler shifts signed values as if they were
|
||||
unsigned. */
|
||||
#cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
|
||||
|
||||
/* Define to 1 if type `char' is unsigned and you are not using gcc. */
|
||||
#ifndef __CHAR_UNSIGNED__
|
||||
#cmakedefine __CHAR_UNSIGNED__ 1
|
||||
#endif
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
/* #undef size_t */
|
|
@ -1,31 +0,0 @@
|
|||
/* libjpeg-turbo build number */
|
||||
#define BUILD "0"
|
||||
|
||||
/* Compiler's inline keyword */
|
||||
#undef inline
|
||||
|
||||
/* How to obtain function inlining. */
|
||||
#define INLINE __inline
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#define PACKAGE_NAME "libjpeg-turbo"
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "2.0.3"
|
||||
|
||||
/* The size of `size_t', as computed by sizeof. */
|
||||
#cmakedefine SIZEOF_SIZE_T @SIZEOF_SIZE_T@
|
||||
|
||||
/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
|
||||
#cmakedefine HAVE_BUILTIN_CTZL
|
||||
|
||||
/* Define to 1 if you have the <intrin.h> header file. */
|
||||
#cmakedefine HAVE_INTRIN_H
|
||||
|
||||
#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
|
||||
#if (SIZEOF_SIZE_T == 8)
|
||||
#define HAVE_BITSCANFORWARD64
|
||||
#elif (SIZEOF_SIZE_T == 4)
|
||||
#define HAVE_BITSCANFORWARD
|
||||
#endif
|
||||
#endif
|
|
@ -52,8 +52,8 @@
|
|||
*/
|
||||
|
||||
/* NOTE: Both GCC and Clang define __GNUC__ */
|
||||
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||
#if !defined __thumb__ || defined __thumb2__
|
||||
#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
|
||||
#if !defined(__thumb__) || defined(__thumb2__)
|
||||
#define USE_CLZ_INTRINSIC
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* jfdctint.c
|
||||
*
|
||||
* This file was part of the Independent JPEG Group's software.
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Copyright (C) 1991-1996, Thomas G. Lane.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2015, D. R. Commander.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* jidctint.c
|
||||
*
|
||||
* This file was part of the Independent JPEG Group's software.
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Copyright (C) 1991-1998, Thomas G. Lane.
|
||||
* Modification developed 2002-2009 by Guido Vollbeding.
|
||||
* libjpeg-turbo Modifications:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* jidctred.c
|
||||
*
|
||||
* This file was part of the Independent JPEG Group's software.
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Copyright (C) 1994-1998, Thomas G. Lane.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2015, D. R. Commander.
|
||||
|
|
0
jpegturbo/release/License.rtf
Executable file → Normal file
0
jpegturbo/release/License.rtf
Executable file → Normal file
|
@ -1,4 +1,4 @@
|
|||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less powerful but more straightforward TurboJPEG API. libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
|
||||
|
||||
|
|
0
jpegturbo/release/Welcome.rtf
Executable file → Normal file
0
jpegturbo/release/Welcome.rtf
Executable file → Normal file
|
@ -8,15 +8,14 @@ Maintainer: @PKGVENDOR@ <@PKGEMAIL@>
|
|||
Homepage: @PKGURL@
|
||||
Installed-Size: {__SIZE}
|
||||
Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
|
||||
by virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG
|
||||
codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86 and x86-64
|
||||
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
|
||||
all else being equal. On other types of systems, libjpeg-turbo can still
|
||||
outperform libjpeg by a significant amount, by virtue of its highly-optimized
|
||||
Huffman coding routines. In many cases, the performance of libjpeg-turbo
|
||||
rivals that of proprietary high-speed JPEG codecs.
|
||||
.
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
|
|
0
jpegturbo/release/makecygwinpkg.in
Executable file → Normal file
0
jpegturbo/release/makecygwinpkg.in
Executable file → Normal file
|
@ -51,14 +51,14 @@ Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{r
|
|||
%endif
|
||||
|
||||
%description
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
|
||||
AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
|
||||
on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
|
||||
compression on x86 and x86-64 systems. On such systems, libjpeg-turbo is
|
||||
generally 2-6x as fast as libjpeg, all else being equal. On other types of
|
||||
systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
|
||||
virtue of its highly-optimized Huffman coding routines. In many cases, the
|
||||
performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
|
||||
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
|
||||
baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
|
||||
MIPS systems, as well as progressive JPEG compression on x86 and x86-64
|
||||
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
|
||||
all else being equal. On other types of systems, libjpeg-turbo can still
|
||||
outperform libjpeg by a significant amount, by virtue of its highly-optimized
|
||||
Huffman coding routines. In many cases, the performance of libjpeg-turbo
|
||||
rivals that of proprietary high-speed JPEG codecs.
|
||||
|
||||
libjpeg-turbo implements both the traditional libjpeg API and the less
|
||||
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
|
||||
|
|
0
jpegturbo/sharedlib/CMakeLists.txt
Executable file → Normal file
0
jpegturbo/sharedlib/CMakeLists.txt
Executable file → Normal file
8
jpegturbo/simd/CMakeLists.txt
Executable file → Normal file
8
jpegturbo/simd/CMakeLists.txt
Executable file → Normal file
|
@ -38,6 +38,14 @@ elseif(CPU_TYPE STREQUAL "i386")
|
|||
endif()
|
||||
endif()
|
||||
|
||||
# Probe for a NASM-compatible assembler before the ASM_NASM language is
# enabled further down.  check_language() only tests for availability and
# records the result in CMAKE_ASM_NASM_COMPILER; it does not itself abort
# configuration when nothing is found.  The probe is skipped entirely when
# REQUIRE_SIMD is set, in which case the enable_language(ASM_NASM) call that
# follows this block runs without a prior check.
if(NOT REQUIRE_SIMD)
|
||||
include(CheckLanguage)
|
||||
check_language(ASM_NASM)
|
||||
if(NOT CMAKE_ASM_NASM_COMPILER)
|
||||
# No assembler was found: report it through simd_fail() (defined outside this
# excerpt -- NOTE(review): confirm it only warns here) and stop processing
# this file so that enable_language(ASM_NASM) is never reached.
simd_fail("SIMD extensions disabled: could not find NASM compiler")
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
enable_language(ASM_NASM)
|
||||
message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
|
||||
|
||||
|
|
|
@ -31,6 +31,251 @@
|
|||
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
.section __DATA, __const
|
||||
#else
|
||||
.section .rodata, "a", %progbits
|
||||
#endif
|
||||
|
||||
/* Constants for jsimd_idct_islow_neon() */
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
/* Constants for jsimd_idct_ifast_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
|
||||
|
||||
/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
|
||||
|
||||
#define CONST_BITS 13
|
||||
|
||||
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
|
||||
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
|
||||
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
|
||||
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
|
||||
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
|
||||
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
|
||||
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
|
||||
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
|
||||
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
|
||||
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
|
||||
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
|
||||
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
|
||||
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS + 1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.balign 8
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
.short FIX_3_624509785 /* v14[3] */
|
||||
|
||||
/* Constants for jsimd_ycc_*_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_ycc_rgb_neon_consts:
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
.short -128, -128, -128, -128
|
||||
|
||||
/* Constants for jsimd_*_ycc_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_rgb_ycc_neon_consts:
|
||||
.short 19595, 38470, 7471, 11059
|
||||
.short 21709, 32768, 27439, 5329
|
||||
.short 32767, 128, 32767, 128
|
||||
.short 32767, 128, 32767, 128
|
||||
|
||||
/* Constants for jsimd_fdct_islow_neon() */
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
/* Constants for jsimd_fdct_ifast_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_ifast_neon_consts:
|
||||
.short (98 * 128) /* XFIX_0_382683433 */
|
||||
.short (139 * 128) /* XFIX_0_541196100 */
|
||||
.short (181 * 128) /* XFIX_0_707106781 */
|
||||
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
|
||||
|
||||
/* Constants for jsimd_h2*_downsample_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_h2_downsample_neon_consts:
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
|
||||
.byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
|
||||
.byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
|
||||
|
||||
/* Constants for jsimd_huff_encode_one_block_neon() */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_huff_encode_one_block_neon_consts:
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
|
||||
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
|
||||
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
|
||||
36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
|
||||
.byte 8, 9, 22, 23, 36, 37, 50, 51, \
|
||||
255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
|
||||
.byte 54, 55, 40, 41, 26, 27, 12, 13, \
|
||||
14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
|
||||
.byte 6, 7, 20, 21, 34, 35, 48, 49, \
|
||||
50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
|
||||
.byte 42, 43, 28, 29, 14, 15, 30, 31, \
|
||||
44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
|
||||
.byte 255, 255, 255, 255, 56, 57, 42, 43, \
|
||||
28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
|
||||
.byte 26, 27, 40, 41, 42, 43, 28, 29, \
|
||||
14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
|
||||
.byte 255, 255, 255, 255, 0, 1, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
|
||||
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
|
||||
|
||||
.text
|
||||
|
||||
|
||||
|
@ -55,6 +300,17 @@ _\fname:
|
|||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * Get symbol location: load the address of \symbol into register \reg.
 *
 *   reg    - destination general-purpose register (receives the address)
 *   symbol - label whose address is wanted
 *
 * The address is formed in two steps: adrp produces the address of the
 * 4 KB page containing the symbol, and add supplies the offset of the
 * symbol within that page.  Unlike a single adr instruction, this pair is
 * not restricted to targets within +/-1 MB of the instruction.
 *
 * The two branches differ only in relocation syntax: Apple toolchains
 * spell the page / page-offset operators as @PAGE and @PAGEOFF, while
 * other (ELF) assemblers use a bare symbol for adrp and the :lo12:
 * operator for the low 12 bits.
 */
|
||||
.macro get_symbol_loc reg, symbol
|
||||
#ifdef __APPLE__
|
||||
adrp \reg, \symbol@PAGE
|
||||
add \reg, \reg, \symbol@PAGEOFF
|
||||
#else
|
||||
adrp \reg, \symbol
|
||||
add \reg, \reg, :lo12:\symbol
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Transpose elements of single 128 bit registers */
|
||||
.macro transpose_single x0, x1, xi, xilen, literal
|
||||
ins \xi\xilen[0], \x0\xilen[0]
|
||||
|
@ -139,51 +395,6 @@ _\fname:
|
|||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
#define XFIX_P_0_298 v0.h[0]
|
||||
#define XFIX_N_0_390 v0.h[1]
|
||||
#define XFIX_P_0_541 v0.h[2]
|
||||
|
@ -217,7 +428,7 @@ asm_function jsimd_idct_islow_neon
|
|||
uxtw x3, w3
|
||||
|
||||
sub sp, sp, #64
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
|
||||
mov x10, sp
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
|
||||
|
@ -791,13 +1002,6 @@ asm_function jsimd_idct_islow_neon
|
|||
#define XFIX_1_847759065 v0.h[2]
|
||||
#define XFIX_2_613125930 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
|
||||
|
||||
asm_function jsimd_idct_ifast_neon
|
||||
|
||||
DCT_TABLE .req x0
|
||||
|
@ -832,7 +1036,7 @@ asm_function jsimd_idct_ifast_neon
|
|||
* 7 | d30 | d31 ( v23.8h )
|
||||
*/
|
||||
/* Save NEON registers used in fast IDCT */
|
||||
adr TMP5, Ljsimd_idct_ifast_neon_consts
|
||||
get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts
|
||||
ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||
ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
|
||||
|
@ -1023,38 +1227,6 @@ asm_function jsimd_idct_ifast_neon
|
|||
* but readability will suffer somewhat.
|
||||
*/
|
||||
|
||||
#define CONST_BITS 13
|
||||
|
||||
#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
|
||||
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
|
||||
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
|
||||
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
|
||||
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
|
||||
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
|
||||
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
|
||||
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
|
||||
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
|
||||
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
|
||||
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
|
||||
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
|
||||
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS + 1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
||||
smull v28.4s, \x4, v2.h[2]
|
||||
smlal v28.4s, \x8, v0.h[0]
|
||||
|
@ -1121,7 +1293,7 @@ asm_function jsimd_idct_4x4_neon
|
|||
sub sp, sp, 64
|
||||
mov x9, sp
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
|
@ -1264,13 +1436,6 @@ asm_function jsimd_idct_4x4_neon
|
|||
* bit exact compatibility with jpeg-6b.
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
.short FIX_3_624509785 /* v14[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||
sshll v15.4s, \x4, #15
|
||||
smull v26.4s, \x6, v14.h[3]
|
||||
|
@ -1311,7 +1476,7 @@ asm_function jsimd_idct_2x2_neon
|
|||
mov x9, sp
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
|
@ -1663,21 +1828,6 @@ asm_function jsimd_idct_2x2_neon
|
|||
do_yuv_to_rgb_stage2
|
||||
.endm
|
||||
|
||||
/* Apple gas crashes on adrl, work around that by using adr.
|
||||
* But this requires a copy of these constants for each function.
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
.if \fast_st3 == 1
|
||||
Ljsimd_ycc_\colorid\()_neon_consts:
|
||||
.else
|
||||
Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
|
||||
.endif
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
.short -128, -128, -128, -128
|
||||
|
||||
.if \fast_st3 == 1
|
||||
asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.else
|
||||
|
@ -1703,11 +1853,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
|||
mov x9, sp
|
||||
|
||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||
.if \fast_st3 == 1
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_consts
|
||||
.else
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
|
||||
.endif
|
||||
get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts
|
||||
|
||||
/* Save NEON registers */
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
|
@ -2004,17 +2150,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
|
|||
do_rgb_to_yuv_stage1
|
||||
.endm
|
||||
|
||||
.balign 16
|
||||
.if \fast_ld3 == 1
|
||||
Ljsimd_\colorid\()_ycc_neon_consts:
|
||||
.else
|
||||
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
|
||||
.endif
|
||||
.short 19595, 38470, 7471, 11059
|
||||
.short 21709, 32768, 27439, 5329
|
||||
.short 32767, 128, 32767, 128
|
||||
.short 32767, 128, 32767, 128
|
||||
|
||||
.if \fast_ld3 == 1
|
||||
asm_function jsimd_\colorid\()_ycc_convert_neon
|
||||
.else
|
||||
|
@ -2037,11 +2172,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
|||
N .req w12
|
||||
|
||||
/* Load constants to d0, d1, d2, d3 */
|
||||
.if \fast_ld3 == 1
|
||||
adr x13, Ljsimd_\colorid\()_ycc_neon_consts
|
||||
.else
|
||||
adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
|
||||
.endif
|
||||
get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
|
||||
ld1 {v0.8h, v1.8h}, [x13]
|
||||
|
||||
ldr OUTPUT_BUF0, [OUTPUT_BUF]
|
||||
|
@ -2241,50 +2372,6 @@ asm_function jsimd_convsamp_neon
|
|||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_islow_neon_consts:
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
#define XFIX_P_0_298 v0.h[0]
|
||||
#define XFIX_N_0_390 v0.h[1]
|
||||
#define XFIX_P_0_541 v0.h[2]
|
||||
|
@ -2304,7 +2391,7 @@ asm_function jsimd_fdct_islow_neon
|
|||
TMP .req x9
|
||||
|
||||
/* Load constants */
|
||||
adr TMP, Ljsimd_fdct_islow_neon_consts
|
||||
get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
|
||||
ld1 {v0.8h, v1.8h}, [TMP]
|
||||
|
||||
/* Save NEON registers */
|
||||
|
@ -2583,20 +2670,13 @@ asm_function jsimd_fdct_islow_neon
|
|||
#define XFIX_0_707106781 v0.h[2]
|
||||
#define XFIX_1_306562965 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
Ljsimd_fdct_ifast_neon_consts:
|
||||
.short (98 * 128) /* XFIX_0_382683433 */
|
||||
.short (139 * 128) /* XFIX_0_541196100 */
|
||||
.short (181 * 128) /* XFIX_0_707106781 */
|
||||
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
|
||||
|
||||
asm_function jsimd_fdct_ifast_neon
|
||||
|
||||
DATA .req x0
|
||||
TMP .req x9
|
||||
|
||||
/* Load constants */
|
||||
adr TMP, Ljsimd_fdct_ifast_neon_consts
|
||||
get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
|
||||
ld1 {v0.4h}, [TMP]
|
||||
|
||||
/* Load all DATA into NEON registers with the following allocation:
|
||||
|
@ -2775,41 +2855,6 @@ asm_function jsimd_quantize_neon
|
|||
* JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
Ljsimd_h2_downsample_neon_consts:
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
|
||||
.byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
|
||||
.byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
|
||||
.byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
|
||||
|
||||
asm_function jsimd_h2v1_downsample_neon
|
||||
IMAGE_WIDTH .req x0
|
||||
MAX_V_SAMP .req x1
|
||||
|
@ -2827,7 +2872,7 @@ asm_function jsimd_h2v1_downsample_neon
|
|||
mov TMPDUP, #0x10000
|
||||
lsl TMP2, BLOCK_WIDTH, #4
|
||||
sub TMP2, TMP2, IMAGE_WIDTH
|
||||
adr TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
add TMP3, TMP3, TMP2, lsl #4
|
||||
dup v16.4s, TMPDUP
|
||||
ld1 {v18.16b}, [TMP3]
|
||||
|
@ -2906,7 +2951,7 @@ asm_function jsimd_h2v2_downsample_neon
|
|||
lsl TMP2, BLOCK_WIDTH, #4
|
||||
lsl TMPDUP, TMPDUP, #17
|
||||
sub TMP2, TMP2, IMAGE_WIDTH
|
||||
adr TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
|
||||
orr TMPDUP, TMPDUP, #1
|
||||
add TMP3, TMP3, TMP2, lsl #4
|
||||
dup v16.4s, TMPDUP
|
||||
|
@ -3012,41 +3057,6 @@ asm_function jsimd_h2v2_downsample_neon
|
|||
|
||||
.macro generate_jsimd_huff_encode_one_block fast_tbl
|
||||
|
||||
.balign 16
|
||||
.if \fast_tbl == 1
|
||||
Ljsimd_huff_encode_one_block_neon_consts:
|
||||
.else
|
||||
Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
|
||||
.endif
|
||||
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
|
||||
.if \fast_tbl == 1
|
||||
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
|
||||
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
|
||||
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
|
||||
36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
|
||||
.byte 8, 9, 22, 23, 36, 37, 50, 51, \
|
||||
255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
|
||||
.byte 54, 55, 40, 41, 26, 27, 12, 13, \
|
||||
14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
|
||||
.byte 6, 7, 20, 21, 34, 35, 48, 49, \
|
||||
50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
|
||||
.byte 42, 43, 28, 29, 14, 15, 30, 31, \
|
||||
44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
|
||||
.byte 255, 255, 255, 255, 56, 57, 42, 43, \
|
||||
28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
|
||||
.byte 26, 27, 40, 41, 42, 43, 28, 29, \
|
||||
14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
|
||||
.byte 255, 255, 255, 255, 0, 1, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
|
||||
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
|
||||
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
|
||||
.endif
|
||||
|
||||
.if \fast_tbl == 1
|
||||
asm_function jsimd_huff_encode_one_block_neon
|
||||
.else
|
||||
|
@ -3056,11 +3066,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
|||
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
|
||||
/* Save ARM registers */
|
||||
stp x19, x20, [sp]
|
||||
.if \fast_tbl == 1
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
.else
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
|
||||
.endif
|
||||
get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
ldr PUT_BUFFER, [x0, #0x10]
|
||||
ldr PUT_BITSw, [x0, #0x18]
|
||||
ldrsh w12, [x2] /* load DC coeff in w12 */
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -110,12 +108,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, BYTE [esi+ecx]
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, WORD [esi+ecx]
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -111,13 +109,13 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
|
|||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
xor eax, eax
|
||||
mov al, BYTE [esi+ecx]
|
||||
mov al, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
xor edx, edx
|
||||
mov dx, WORD [esi+ecx]
|
||||
mov dx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
@ -127,7 +125,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
|
|||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, DWORD [esi+ecx]
|
||||
movd mmG, dword [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA, mmG
|
||||
.column_ld8:
|
||||
|
@ -197,7 +195,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
|
|||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -109,12 +107,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, BYTE [esi+ecx]
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, WORD [esi+ecx]
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -102,12 +100,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, BYTE [esi+ecx]
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, WORD [esi+ecx]
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -103,13 +101,13 @@ EXTN(jsimd_rgb_gray_convert_mmx):
|
|||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
xor eax, eax
|
||||
mov al, BYTE [esi+ecx]
|
||||
mov al, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
xor edx, edx
|
||||
mov dx, WORD [esi+ecx]
|
||||
mov dx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
@ -119,7 +117,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
|
|||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
movd mmG, DWORD [esi+ecx]
|
||||
movd mmG, dword [esi+ecx]
|
||||
psllq mmA, DWORD_BIT
|
||||
por mmA, mmG
|
||||
.column_ld8:
|
||||
|
@ -189,7 +187,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
|
|||
test cl, SIZEOF_MMWORD/8
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_MMWORD/8
|
||||
movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
|
||||
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_MMWORD/4
|
||||
jz short .column_ld4
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -101,12 +99,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub ecx, byte SIZEOF_BYTE
|
||||
movzx eax, BYTE [esi+ecx]
|
||||
movzx eax, byte [esi+ecx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
movzx edx, WORD [esi+ecx]
|
||||
movzx edx, word [esi+ecx]
|
||||
shl eax, WORD_BIT
|
||||
or eax, edx
|
||||
.column_ld4:
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based directly on jchuff.c; see jchuff.c for more
|
||||
; details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
@ -197,8 +195,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
push ebp
|
||||
|
||||
mov esi, POINTER [eax+8] ; (working_state *state)
|
||||
mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
|
||||
mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits;
|
||||
push esi ; esi is now scratch
|
||||
|
||||
get_GOT edx ; get GOT address
|
||||
|
@ -214,7 +212,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
; Encode the DC coefficient difference per section F.1.2.1
|
||||
mov esi, POINTER [esp+block] ; block
|
||||
movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
|
||||
sub ecx, DWORD [eax+20]
|
||||
sub ecx, dword [eax+20]
|
||||
mov esi, ecx
|
||||
|
||||
; This is a well-known technique for obtaining the absolute value
|
||||
|
@ -229,12 +227,12 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
; For a negative input, want temp2 = bitwise complement of abs(input)
|
||||
; This code assumes we are on a two's complement machine
|
||||
add esi, edx ; temp2 += temp3;
|
||||
mov DWORD [esp+temp], esi ; backup temp2 in temp
|
||||
mov dword [esp+temp], esi ; backup temp2 in temp
|
||||
|
||||
; Find the number of bits needed for the magnitude of the coefficient
|
||||
movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
|
||||
movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
|
||||
mov DWORD [esp+temp2], edx ; backup nbits in temp2
|
||||
mov dword [esp+temp2], edx ; backup nbits in temp2
|
||||
|
||||
; Emit the Huffman-coded symbol for the number of bits
|
||||
mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
|
||||
|
@ -242,13 +240,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
|
||||
EMIT_BITS eax ; EMIT_BITS(code, size)
|
||||
|
||||
mov ecx, DWORD [esp+temp2] ; restore nbits
|
||||
mov ecx, dword [esp+temp2] ; restore nbits
|
||||
|
||||
; Mask off any extra bits in code
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, DWORD [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
|
||||
; Emit that number of bits of the value, if positive,
|
||||
; or the complement of its magnitude, if negative.
|
||||
|
@ -291,22 +289,22 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
jz near .ELOOP
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
shr edx, cl ; index >>= r;
|
||||
mov DWORD [esp+temp3], edx
|
||||
mov dword [esp+temp3], edx
|
||||
.BRLOOP:
|
||||
cmp ecx, 16 ; while (r > 15) {
|
||||
jl near .ERLOOP
|
||||
sub ecx, 16 ; r -= 16;
|
||||
mov DWORD [esp+temp], ecx
|
||||
mov dword [esp+temp], ecx
|
||||
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
mov ecx, DWORD [esp+temp]
|
||||
mov ecx, dword [esp+temp]
|
||||
jmp .BRLOOP
|
||||
.ERLOOP:
|
||||
movsx eax, word [esi] ; temp = t1[k];
|
||||
movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
|
||||
movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
|
||||
mov DWORD [esp+temp2], eax
|
||||
mov dword [esp+temp2], eax
|
||||
; Emit Huffman symbol for run length / number of bits
|
||||
shl ecx, 4 ; temp3 = (r << 4) + nbits;
|
||||
add ecx, eax
|
||||
|
@ -316,13 +314,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
|
||||
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov ecx, DWORD [esp+temp2]
|
||||
mov ecx, dword [esp+temp2]
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
|
||||
mov edx, DWORD [esp+temp3]
|
||||
mov edx, dword [esp+temp3]
|
||||
add esi, 2 ; ++k;
|
||||
shr edx, 1 ; index >>= 1;
|
||||
|
||||
|
@ -352,29 +350,29 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
shr edx, cl ; index >>= r;
|
||||
add ecx, eax
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
mov DWORD [esp+temp3], edx
|
||||
mov dword [esp+temp3], edx
|
||||
jmp .BRLOOP2
|
||||
.BLOOP2:
|
||||
bsf ecx, edx ; r = __builtin_ctzl(index);
|
||||
jz near .ELOOP2
|
||||
lea esi, [esi+ecx*2] ; k += r;
|
||||
shr edx, cl ; index >>= r;
|
||||
mov DWORD [esp+temp3], edx
|
||||
mov dword [esp+temp3], edx
|
||||
.BRLOOP2:
|
||||
cmp ecx, 16 ; while (r > 15) {
|
||||
jl near .ERLOOP2
|
||||
sub ecx, 16 ; r -= 16;
|
||||
mov DWORD [esp+temp], ecx
|
||||
mov dword [esp+temp], ecx
|
||||
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
|
||||
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
|
||||
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
|
||||
mov ecx, DWORD [esp+temp]
|
||||
mov ecx, dword [esp+temp]
|
||||
jmp .BRLOOP2
|
||||
.ERLOOP2:
|
||||
movsx eax, word [esi] ; temp = t1[k];
|
||||
bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
|
||||
inc eax
|
||||
mov DWORD [esp+temp2], eax
|
||||
mov dword [esp+temp2], eax
|
||||
; Emit Huffman symbol for run length / number of bits
|
||||
shl ecx, 4 ; temp3 = (r << 4) + nbits;
|
||||
add ecx, eax
|
||||
|
@ -384,13 +382,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
|
||||
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
|
||||
; Mask off any extra bits in code
|
||||
mov ecx, DWORD [esp+temp2]
|
||||
mov ecx, dword [esp+temp2]
|
||||
mov eax, 1
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
|
||||
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
|
||||
mov edx, DWORD [esp+temp3]
|
||||
mov edx, dword [esp+temp3]
|
||||
add esi, 2 ; ++k;
|
||||
shr edx, 1 ; index >>= 1;
|
||||
|
||||
|
@ -407,8 +405,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
mov eax, [esp+buffer]
|
||||
pop esi
|
||||
; Save put_buffer & put_bits
|
||||
mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
|
||||
mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits;
|
||||
|
||||
pop ebp
|
||||
pop edi
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -348,7 +346,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
|
|||
vmovd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi], ax
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
|
@ -357,7 +355,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
|
|||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
mov BYTE [edi], al
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -280,7 +278,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
|
|||
movd eax, mmA
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st2
|
||||
mov DWORD [edi+0*SIZEOF_DWORD], eax
|
||||
mov dword [edi+0*SIZEOF_DWORD], eax
|
||||
psrlq mmA, DWORD_BIT
|
||||
movd eax, mmA
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
|
@ -288,14 +286,14 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
|
|||
.column_st2:
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi+0*SIZEOF_WORD], ax
|
||||
mov word [edi+0*SIZEOF_WORD], ax
|
||||
shr eax, WORD_BIT
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
add edi, byte SIZEOF_WORD
|
||||
.column_st1:
|
||||
cmp ecx, byte SIZEOF_BYTE
|
||||
jb short .nextrow
|
||||
mov BYTE [edi+0*SIZEOF_BYTE], al
|
||||
mov byte [edi+0*SIZEOF_BYTE], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
@ -367,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
|
|||
.column_st4:
|
||||
cmp ecx, byte SIZEOF_MMWORD/8
|
||||
jb short .nextrow
|
||||
movd DWORD [edi+0*SIZEOF_DWORD], mmA
|
||||
movd dword [edi+0*SIZEOF_DWORD], mmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -320,7 +318,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||
movd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi], ax
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
|
@ -329,7 +327,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||
; space.
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
mov BYTE [edi], al
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -354,7 +352,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
|
|||
vmovd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi], ax
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
|
@ -363,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
|
|||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
mov BYTE [edi], al
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -283,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
|
|||
movd eax, mmA
|
||||
cmp ecx, byte SIZEOF_DWORD
|
||||
jb short .column_st2
|
||||
mov DWORD [edi+0*SIZEOF_DWORD], eax
|
||||
mov dword [edi+0*SIZEOF_DWORD], eax
|
||||
psrlq mmA, DWORD_BIT
|
||||
movd eax, mmA
|
||||
sub ecx, byte SIZEOF_DWORD
|
||||
|
@ -291,14 +289,14 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
|
|||
.column_st2:
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi+0*SIZEOF_WORD], ax
|
||||
mov word [edi+0*SIZEOF_WORD], ax
|
||||
shr eax, WORD_BIT
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
add edi, byte SIZEOF_WORD
|
||||
.column_st1:
|
||||
cmp ecx, byte SIZEOF_BYTE
|
||||
jb short .endcolumn
|
||||
mov BYTE [edi+0*SIZEOF_BYTE], al
|
||||
mov byte [edi+0*SIZEOF_BYTE], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
@ -373,7 +371,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
|
|||
.column_st4:
|
||||
cmp ecx, byte SIZEOF_MMWORD/8
|
||||
jb short .endcolumn
|
||||
movd DWORD [edi+0*SIZEOF_DWORD], mmA
|
||||
movd dword [edi+0*SIZEOF_DWORD], mmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -325,7 +323,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||
movd eax, xmmA
|
||||
cmp ecx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [edi], ax
|
||||
mov word [edi], ax
|
||||
add edi, byte SIZEOF_WORD
|
||||
sub ecx, byte SIZEOF_WORD
|
||||
shr eax, 16
|
||||
|
@ -334,7 +332,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
mov BYTE [edi], al
|
||||
mov byte [edi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -92,23 +90,23 @@ EXTN(jsimd_idct_float_3dnow):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
pushpic ebx ; save GOT address
|
||||
mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, ebx
|
||||
poppic ebx ; restore GOT address
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd mm0, mm0
|
||||
psrad mm0, (DWORD_BIT-WORD_BIT)
|
||||
|
@ -135,10 +133,10 @@ EXTN(jsimd_idct_float_3dnow):
|
|||
|
||||
; -- Even part
|
||||
|
||||
movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd mm0, mm0
|
||||
punpcklwd mm1, mm1
|
||||
|
@ -182,10 +180,10 @@ EXTN(jsimd_idct_float_3dnow):
|
|||
|
||||
; -- Odd part
|
||||
|
||||
movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
|
||||
movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd mm2, mm2
|
||||
punpcklwd mm3, mm3
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse2):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -123,8 +121,8 @@ EXTN(jsimd_idct_ifast_mmx):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -118,8 +116,8 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -320,8 +318,8 @@ EXTN(jsimd_idct_islow_avx2):
|
|||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -136,8 +134,8 @@ EXTN(jsimd_idct_islow_mmx):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see the jidctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -131,8 +129,8 @@ EXTN(jsimd_idct_islow_sse2):
|
|||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see the jidctred.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -144,8 +142,8 @@ EXTN(jsimd_idct_4x4_mmx):
|
|||
alignx 16, 7
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
@ -464,16 +462,16 @@ EXTN(jsimd_idct_4x4_mmx):
|
|||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
|
||||
movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
|
||||
psrlq mm1, 4*BYTE_BIT
|
||||
psrlq mm0, 4*BYTE_BIT
|
||||
|
||||
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
|
||||
movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
|
||||
movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
|
@ -688,8 +686,8 @@ EXTN(jsimd_idct_2x2_mmx):
|
|||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
mov word [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov word [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
|
||||
emms ; empty MMX state
|
||||
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see the jidctred.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
@ -139,8 +137,8 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
|
||||
mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
|
||||
jnz short .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
|
||||
|
@ -578,8 +576,8 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||
|
||||
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
|
||||
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
|
||||
mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
mov word [edx+eax*SIZEOF_JSAMPLE], bx
|
||||
mov word [esi+eax*SIZEOF_JSAMPLE], cx
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -7,8 +7,6 @@
|
|||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -7,8 +7,6 @@
|
|||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
; Each IDCT routine is responsible for range-limiting its results and
|
||||
; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
; jsimdext.inc - common declarations
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2010, 2016, D. R. Commander.
|
||||
; Copyright (C) 2010, 2016, 2019, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthieu Darbois.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
|
||||
|
@ -24,8 +24,6 @@
|
|||
; 2. Altered source versions must be plainly marked as such, and must not be
|
||||
; misrepresented as being the original software.
|
||||
; 3. This notice may not be removed or altered from any source distribution.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
; ==========================================================================
|
||||
; System-dependent configurations
|
||||
|
@ -167,19 +165,19 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
|||
%define XMM_DWORD
|
||||
%define XMM_MMWORD
|
||||
|
||||
%define SIZEOF_BYTE 1 ; sizeof(BYTE)
|
||||
%define SIZEOF_WORD 2 ; sizeof(WORD)
|
||||
%define SIZEOF_DWORD 4 ; sizeof(DWORD)
|
||||
%define SIZEOF_QWORD 8 ; sizeof(QWORD)
|
||||
%define SIZEOF_OWORD 16 ; sizeof(OWORD)
|
||||
%define SIZEOF_YWORD 32 ; sizeof(YWORD)
|
||||
%define SIZEOF_BYTE 1 ; sizeof(byte)
|
||||
%define SIZEOF_WORD 2 ; sizeof(word)
|
||||
%define SIZEOF_DWORD 4 ; sizeof(dword)
|
||||
%define SIZEOF_QWORD 8 ; sizeof(qword)
|
||||
%define SIZEOF_OWORD 16 ; sizeof(oword)
|
||||
%define SIZEOF_YWORD 32 ; sizeof(yword)
|
||||
|
||||
%define BYTE_BIT 8 ; CHAR_BIT in C
|
||||
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
|
||||
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
|
||||
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
|
||||
%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
|
||||
%define YWORD_BIT 256 ; sizeof(YWORD)*BYTE_BIT
|
||||
%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
|
||||
%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
|
||||
%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
|
||||
%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
|
||||
%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; External Symbol Name
|
||||
|
@ -198,6 +196,11 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
|||
%ifdef __YASM_VER__
|
||||
%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
|
||||
%define GLOBAL_DATA(name) global EXTN(name):private_extern
|
||||
%else
|
||||
%if __NASM_VERSION_ID__ >= 0x020E0000
|
||||
%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
|
||||
%define GLOBAL_DATA(name) global EXTN(name):private_extern
|
||||
%endif
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -96,12 +94,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, BYTE [rsi+rcx]
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, WORD [rsi+rcx]
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -95,12 +93,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, BYTE [rsi+rcx]
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, WORD [rsi+rcx]
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -88,12 +86,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, BYTE [rsi+rcx]
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, WORD [rsi+rcx]
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -87,12 +85,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, BYTE [rsi+rcx]
|
||||
movzx rax, byte [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, WORD [rsi+rcx]
|
||||
movzx rdx, word [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax, rdx
|
||||
.column_ld4:
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based directly on jchuff.c; see jchuff.c for more
|
||||
; details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
@ -200,7 +198,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
mov buffer, r11 ; r11 is now scratch
|
||||
|
||||
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
|
||||
mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
|
||||
mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits;
|
||||
push r10 ; r10 is now scratch
|
||||
|
||||
; Encode the DC coefficient difference per section F.1.2.1
|
||||
|
@ -333,7 +331,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||
pop r10
|
||||
; Save put_buffer & put_bits
|
||||
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
|
||||
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
|
||||
mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits;
|
||||
|
||||
pop rbx
|
||||
uncollect_args 6
|
||||
|
|
|
@ -16,8 +16,6 @@
|
|||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -334,7 +332,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
|
|||
vmovd eax, xmmA
|
||||
cmp rcx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [rdi], ax
|
||||
mov word [rdi], ax
|
||||
add rdi, byte SIZEOF_WORD
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
shr rax, 16
|
||||
|
@ -343,7 +341,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
|
|||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
mov BYTE [rdi], al
|
||||
mov byte [rdi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
|
@ -306,7 +304,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||
movd eax, xmmA
|
||||
cmp rcx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [rdi], ax
|
||||
mov word [rdi], ax
|
||||
add rdi, byte SIZEOF_WORD
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
shr rax, 16
|
||||
|
@ -315,7 +313,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
mov BYTE [rdi], al
|
||||
mov byte [rdi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
|
|
|
@ -14,8 +14,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue