- started adding ARM support. incomplete. won't compile. don't try.

This commit is contained in:
Rachael Alexanderson 2017-03-04 16:50:42 -05:00
parent 09530e9496
commit be8abba344
5 changed files with 1448 additions and 0 deletions

View File

@ -502,6 +502,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
set( CMAKE_EXE_LINKER_FLAGS "-stdlib=libc++ ${CMAKE_EXE_LINKER_FLAGS}" ) set( CMAKE_EXE_LINKER_FLAGS "-stdlib=libc++ ${CMAKE_EXE_LINKER_FLAGS}" )
endif () endif ()
# ARM processors (Raspberry Pi) - enable ARM NEON support.
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
set( CMAKE_CXX_FLAGS "-mfpu=neon ${CMAKE_CXX_FLAGS}" )
endif ()
# Remove extra warnings when using the official DirectX headers. # Remove extra warnings when using the official DirectX headers.
# Also, TDM-GCC 4.4.0 no longer accepts glibc-style printf formats as valid, # Also, TDM-GCC 4.4.0 no longer accepts glibc-style printf formats as valid,
# which is a royal pain. The previous version I had been using was fine with them. # which is a royal pain. The previous version I had been using was fine with them.

1198
src/ila/SSE2NEON.h Normal file

File diff suppressed because it is too large Load Diff

56
src/ila/ila.h Normal file
View File

@ -0,0 +1,56 @@
/*******************************************************************************
Copyright (c) 2017 Rachael Alexanderson
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
================================================================================
Simple In-Line Assembly support for ARM
The purpose of this file is to provide SSE2 translations to ARM NEON. Note
that this requires a special GCC flag (which should automatically be set by
CMAKE).
As stated in the other included file - this will never be as fast as remaking
the SSE2 instructions from the ground up. But since I have no interest in
doing so, myself - have at it if you want proper ARM support!
For Raspberry Pi, note that this requires RPI2 or higher.
*******************************************************************************/
#if defined(__ARM_NEON__)
#include "ila/sse_to_neon.hpp"
// credit to Magnus Norddahl
inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
}
#else
// Just standard old Intel!
#include <emmintrin.h>
#endif

187
src/ila/sse_to_neon.hpp Normal file
View File

@ -0,0 +1,187 @@
//
// sse_to_neon.hpp
// neon_test
//
// Created by Tim Oberhauser on 11/16/13.
// Copyright (c) 2013 Tim Oberhauser. All rights reserved.
//
#ifndef neon_test_sse_to_neon_hpp
#define neon_test_sse_to_neon_hpp
#include <arm_neon.h>
#if defined(__MM_MALLOC_H)
// copied from mm_malloc.h {
#include <stdlib.h>
/* We can't depend on <stdlib.h> since the prototype of posix_memalign
may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
#endif
static __inline void *
_mm_malloc (size_t size, size_t alignment)
{
void *ptr;
if (alignment == 1)
return malloc (size);
if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4))
alignment = sizeof (void *);
if (posix_memalign (&ptr, alignment, size) == 0)
return ptr;
else
return NULL;
}
static __inline void
_mm_free (void * ptr)
{
free (ptr);
}
// } copied from mm_malloc.h
#endif
typedef int16x8_t __m128i;
typedef float32x4_t __m128;
// ADDITION
inline __m128i _mm_add_epi16(const __m128i& a, const __m128i& b){
return vaddq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
}
inline __m128 _mm_add_ps(const __m128& a, const __m128& b){
return vaddq_f32(a,b);
}
// SUBTRACTION
inline __m128i _mm_sub_epi16(const __m128i& a, const __m128i& b){
return vsubq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
}
inline __m128 _mm_sub_ps(const __m128& a, const __m128& b){
return vsubq_f32(a,b);
}
// MULTIPLICATION
#if 0
inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
}
#endif
inline __m128 _mm_mul_ps(const __m128& a, const __m128& b){
return vmulq_f32(a,b);
}
// SET VALUE
inline __m128i _mm_set1_epi16(const int16_t w){
return vmovq_n_s16(w);
}
inline __m128i _mm_setzero_si128(){
return vmovq_n_s16(0);
}
inline __m128 _mm_set1_ps(const float32_t& w){
return vmovq_n_f32(w);
}
// STORE
inline void _mm_storeu_si128(__m128i* p, __m128i& a){
vst1q_s16(reinterpret_cast<int16_t*>(p),reinterpret_cast<int16x8_t>(a));
}
inline void _mm_store_ps(float32_t* p, __m128&a){
vst1q_f32(p,a);
}
// LOAD
inline __m128i _mm_loadu_si128(__m128i* p){//For SSE address p does not need be 16-byte aligned
return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
}
inline __m128i _mm_load_si128(__m128i* p){//For SSE address p must be 16-byte aligned
return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
}
inline __m128 _mm_load_ps(const float32_t* p){
return reinterpret_cast<__m128>(vld1q_f32(p));
}
// SHIFT OPERATIONS
inline __m128i _mm_srai_epi16(const __m128i& a, const int count){
int16x8_t b = vmovq_n_s16(-count);
return reinterpret_cast<__m128i>(vshlq_s16(a,b));
// return vrshrq_n_s16(a, count);// TODO Argument to '__builtin_neon_vrshrq_n_v' must be a constant integer
}
// MIN/MAX OPERATIONS
inline __m128 _mm_max_ps(const __m128& a, const __m128& b){
return reinterpret_cast<__m128>(vmaxq_f32(reinterpret_cast<float32x4_t>(a),reinterpret_cast<float32x4_t>(b)));
}
// SINGLE ELEMENT ACCESS
inline int16_t _mm_extract_epi16(__m128i& a, int index){
return (reinterpret_cast<int16_t*>(&a))[index];
// return vgetq_lane_s16(a,index);// TODO Argument to '__builtin_neon_vgetq_lane_i16' must be a constant integer
}
// MISCELLANOUS
inline __m128i _mm_sad_epu8 (__m128i a, __m128i b){
uint64x2_t sad = reinterpret_cast<uint64x2_t>(vabdq_u8(reinterpret_cast<uint8x16_t>(a),reinterpret_cast<uint8x16_t>(b)));
sad = reinterpret_cast<uint64x2_t>(vpaddlq_u8(reinterpret_cast<uint8x16_t>(sad)));
sad = reinterpret_cast<uint64x2_t>(vpaddlq_u16(reinterpret_cast<uint16x8_t>(sad)));
sad = vpaddlq_u32(reinterpret_cast<uint32x4_t>(sad));
return reinterpret_cast<__m128i>(sad);
}
// LOGICAL OPERATIONS
inline __m128 _mm_and_ps(__m128& a, __m128& b){
return reinterpret_cast<__m128>(vandq_u32(reinterpret_cast<uint32x4_t>(a),reinterpret_cast<uint32x4_t>(b)));
}
// CONVERSIONS
inline __m128i _mm_packus_epi16 (const __m128i a, const __m128i b){
__m128i result = _mm_setzero_si128();
int8x8_t* a_narrow = reinterpret_cast<int8x8_t*>(&result);
int8x8_t* b_narrow = &a_narrow[1];
*a_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(a));
*b_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(b));
return result;
}
// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
inline __m128i _mm_unpacklo_epi8(__m128i a, const __m128i dummy_zero){
// dummy_zero is a dummy variable
uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
return reinterpret_cast<__m128i>(vmovl_u8(*a_low));
}
// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
inline __m128i _mm_unpackhi_epi8(__m128i a, const __m128i dummy_zero){
// dummy_zero is a dummy variable
uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
return reinterpret_cast<__m128i>(vmovl_u8(a_low[1]));
}
#endif

View File

@ -36,6 +36,8 @@
#ifndef NO_SSE #ifndef NO_SSE
#include <immintrin.h> #include <immintrin.h>
#else
#include "ila/ila.h"
#endif #endif
struct FSpecialColormap; struct FSpecialColormap;