mirror of
https://github.com/ZDoom/qzdoom-gpl.git
synced 2024-11-29 15:12:11 +00:00
- started adding ARM support. incomplete. won't compile. don't try.
This commit is contained in:
parent
09530e9496
commit
be8abba344
5 changed files with 1448 additions and 0 deletions
|
@ -502,6 +502,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
|
|||
set( CMAKE_EXE_LINKER_FLAGS "-stdlib=libc++ ${CMAKE_EXE_LINKER_FLAGS}" )
|
||||
endif ()
|
||||
|
||||
# ARM processors (Raspberry Pi) - enable ARM NEON support.
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
|
||||
set( CMAKE_CXX_FLAGS "-mfpu=neon ${CMAKE_CXX_FLAGS}" )
|
||||
endif ()
|
||||
|
||||
# Remove extra warnings when using the official DirectX headers.
|
||||
# Also, TDM-GCC 4.4.0 no longer accepts glibc-style printf formats as valid,
|
||||
# which is a royal pain. The previous version I had been using was fine with them.
|
||||
|
|
1198
src/ila/SSE2NEON.h
Normal file
1198
src/ila/SSE2NEON.h
Normal file
File diff suppressed because it is too large
Load diff
56
src/ila/ila.h
Normal file
56
src/ila/ila.h
Normal file
|
@ -0,0 +1,56 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2017 Rachael Alexanderson
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
================================================================================
|
||||
|
||||
Simple In-Line Assembly support for ARM
|
||||
|
||||
The purpose of this file is to provide SSE2 translations to ARM NEON. Note
|
||||
that this requires a special GCC flag (which should automatically be set by
|
||||
CMAKE).
|
||||
|
||||
As stated in the other included file - this will never be as fast as remaking
|
||||
the SSE2 instructions from the ground up. But since I have no interest in
|
||||
doing so, myself - have at it if you want proper ARM support!
|
||||
|
||||
For Raspberry Pi, note that this requires RPI2 or higher.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
#include "ila/sse_to_neon.hpp"
|
||||
// credit to Magnus Norddahl
|
||||
inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
|
||||
return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
|
||||
}
|
||||
#else
|
||||
// Just standard old Intel!
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
187
src/ila/sse_to_neon.hpp
Normal file
187
src/ila/sse_to_neon.hpp
Normal file
|
@ -0,0 +1,187 @@
|
|||
//
|
||||
// sse_to_neon.hpp
|
||||
// neon_test
|
||||
//
|
||||
// Created by Tim Oberhauser on 11/16/13.
|
||||
// Copyright (c) 2013 Tim Oberhauser. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef neon_test_sse_to_neon_hpp
|
||||
#define neon_test_sse_to_neon_hpp
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(__MM_MALLOC_H)
|
||||
// copied from mm_malloc.h {
|
||||
#include <stdlib.h>
|
||||
|
||||
/* We can't depend on <stdlib.h> since the prototype of posix_memalign
|
||||
may not be visible. */
|
||||
#ifndef __cplusplus
|
||||
extern int posix_memalign (void **, size_t, size_t);
|
||||
#else
|
||||
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
|
||||
#endif
|
||||
|
||||
static __inline void *
|
||||
_mm_malloc (size_t size, size_t alignment)
|
||||
{
|
||||
void *ptr;
|
||||
if (alignment == 1)
|
||||
return malloc (size);
|
||||
if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4))
|
||||
alignment = sizeof (void *);
|
||||
if (posix_memalign (&ptr, alignment, size) == 0)
|
||||
return ptr;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static __inline void
|
||||
_mm_free (void * ptr)
|
||||
{
|
||||
free (ptr);
|
||||
}
|
||||
// } copied from mm_malloc.h
|
||||
#endif
|
||||
|
||||
|
||||
typedef int16x8_t __m128i;
|
||||
typedef float32x4_t __m128;
|
||||
|
||||
|
||||
// ADDITION
|
||||
inline __m128i _mm_add_epi16(const __m128i& a, const __m128i& b){
|
||||
return vaddq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
|
||||
}
|
||||
|
||||
inline __m128 _mm_add_ps(const __m128& a, const __m128& b){
|
||||
return vaddq_f32(a,b);
|
||||
}
|
||||
|
||||
|
||||
// SUBTRACTION
|
||||
inline __m128i _mm_sub_epi16(const __m128i& a, const __m128i& b){
|
||||
return vsubq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
|
||||
}
|
||||
|
||||
inline __m128 _mm_sub_ps(const __m128& a, const __m128& b){
|
||||
return vsubq_f32(a,b);
|
||||
}
|
||||
|
||||
|
||||
// MULTIPLICATION
|
||||
#if 0
|
||||
inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
|
||||
return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
|
||||
}
|
||||
#endif
|
||||
|
||||
inline __m128 _mm_mul_ps(const __m128& a, const __m128& b){
|
||||
return vmulq_f32(a,b);
|
||||
}
|
||||
|
||||
|
||||
// SET VALUE
|
||||
inline __m128i _mm_set1_epi16(const int16_t w){
|
||||
return vmovq_n_s16(w);
|
||||
}
|
||||
|
||||
inline __m128i _mm_setzero_si128(){
|
||||
return vmovq_n_s16(0);
|
||||
}
|
||||
|
||||
inline __m128 _mm_set1_ps(const float32_t& w){
|
||||
return vmovq_n_f32(w);
|
||||
}
|
||||
|
||||
|
||||
// STORE
|
||||
inline void _mm_storeu_si128(__m128i* p, __m128i& a){
|
||||
vst1q_s16(reinterpret_cast<int16_t*>(p),reinterpret_cast<int16x8_t>(a));
|
||||
}
|
||||
|
||||
inline void _mm_store_ps(float32_t* p, __m128&a){
|
||||
vst1q_f32(p,a);
|
||||
}
|
||||
|
||||
|
||||
// LOAD
|
||||
inline __m128i _mm_loadu_si128(__m128i* p){//For SSE address p does not need be 16-byte aligned
|
||||
return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
|
||||
}
|
||||
|
||||
inline __m128i _mm_load_si128(__m128i* p){//For SSE address p must be 16-byte aligned
|
||||
return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
|
||||
}
|
||||
|
||||
inline __m128 _mm_load_ps(const float32_t* p){
|
||||
return reinterpret_cast<__m128>(vld1q_f32(p));
|
||||
}
|
||||
|
||||
|
||||
// SHIFT OPERATIONS
|
||||
inline __m128i _mm_srai_epi16(const __m128i& a, const int count){
|
||||
int16x8_t b = vmovq_n_s16(-count);
|
||||
return reinterpret_cast<__m128i>(vshlq_s16(a,b));
|
||||
// return vrshrq_n_s16(a, count);// TODO Argument to '__builtin_neon_vrshrq_n_v' must be a constant integer
|
||||
}
|
||||
|
||||
|
||||
// MIN/MAX OPERATIONS
|
||||
inline __m128 _mm_max_ps(const __m128& a, const __m128& b){
|
||||
return reinterpret_cast<__m128>(vmaxq_f32(reinterpret_cast<float32x4_t>(a),reinterpret_cast<float32x4_t>(b)));
|
||||
}
|
||||
|
||||
|
||||
// SINGLE ELEMENT ACCESS
|
||||
inline int16_t _mm_extract_epi16(__m128i& a, int index){
|
||||
return (reinterpret_cast<int16_t*>(&a))[index];
|
||||
// return vgetq_lane_s16(a,index);// TODO Argument to '__builtin_neon_vgetq_lane_i16' must be a constant integer
|
||||
}
|
||||
|
||||
|
||||
// MISCELLANOUS
|
||||
inline __m128i _mm_sad_epu8 (__m128i a, __m128i b){
|
||||
uint64x2_t sad = reinterpret_cast<uint64x2_t>(vabdq_u8(reinterpret_cast<uint8x16_t>(a),reinterpret_cast<uint8x16_t>(b)));
|
||||
sad = reinterpret_cast<uint64x2_t>(vpaddlq_u8(reinterpret_cast<uint8x16_t>(sad)));
|
||||
sad = reinterpret_cast<uint64x2_t>(vpaddlq_u16(reinterpret_cast<uint16x8_t>(sad)));
|
||||
sad = vpaddlq_u32(reinterpret_cast<uint32x4_t>(sad));
|
||||
return reinterpret_cast<__m128i>(sad);
|
||||
}
|
||||
|
||||
|
||||
// LOGICAL OPERATIONS
|
||||
inline __m128 _mm_and_ps(__m128& a, __m128& b){
|
||||
return reinterpret_cast<__m128>(vandq_u32(reinterpret_cast<uint32x4_t>(a),reinterpret_cast<uint32x4_t>(b)));
|
||||
}
|
||||
|
||||
|
||||
// CONVERSIONS
|
||||
inline __m128i _mm_packus_epi16 (const __m128i a, const __m128i b){
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int8x8_t* a_narrow = reinterpret_cast<int8x8_t*>(&result);
|
||||
int8x8_t* b_narrow = &a_narrow[1];
|
||||
*a_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(a));
|
||||
*b_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(b));
|
||||
return result;
|
||||
}
|
||||
|
||||
// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
|
||||
inline __m128i _mm_unpacklo_epi8(__m128i a, const __m128i dummy_zero){
|
||||
// dummy_zero is a dummy variable
|
||||
uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
|
||||
return reinterpret_cast<__m128i>(vmovl_u8(*a_low));
|
||||
}
|
||||
|
||||
// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
|
||||
inline __m128i _mm_unpackhi_epi8(__m128i a, const __m128i dummy_zero){
|
||||
// dummy_zero is a dummy variable
|
||||
uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
|
||||
return reinterpret_cast<__m128i>(vmovl_u8(a_low[1]));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
|
@ -36,6 +36,8 @@
|
|||
|
||||
#ifndef NO_SSE
|
||||
#include <immintrin.h>
|
||||
#else
|
||||
#include "ila/ila.h"
|
||||
#endif
|
||||
|
||||
struct FSpecialColormap;
|
||||
|
|
Loading…
Reference in a new issue