- started adding ARM support. incomplete. won't compile. don't try.

2017-03-04 16:50:42 -05:00 · 2017-03-04 16:50:42 -05:00 · be8abba344
parent 09530e9496
commit be8abba344
5 changed files with 1448 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -502,6 +502,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
 		set( CMAKE_EXE_LINKER_FLAGS "-stdlib=libc++ ${CMAKE_EXE_LINKER_FLAGS}" )
 	endif ()
 	# ARM processors (Raspberry Pi) - enable ARM NEON support.
 	if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") 
 		set( CMAKE_CXX_FLAGS "-mfpu=neon ${CMAKE_CXX_FLAGS}" )
 	endif ()
 	# Remove extra warnings when using the official DirectX headers.
 	# Also, TDM-GCC 4.4.0 no longer accepts glibc-style printf formats as valid,
 	# which is a royal pain. The previous version I had been using was fine with them.
--- a/src/ila/SSE2NEON.h
+++ b/src/ila/SSE2NEON.h
--- a/src/ila/ila.h
+++ b/src/ila/ila.h
@ -0,0 +1,56 @@
 /*******************************************************************************
 Copyright (c) 2017 Rachael Alexanderson
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 1. Redistributions of source code must retain the above copyright notice,
 this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright notice,
 this list of conditions and the following disclaimer in the documentation
 and/or other materials provided with the distribution.
 3. Neither the name of the copyright holder nor the names of its contributors
 may be used to endorse or promote products derived from this software
 without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ================================================================================
 Simple In-Line Assembly support for ARM
 The purpose of this file is to provide SSE2 translations to ARM NEON. Note
 that this requires a special GCC flag (which should automatically be set by
 CMAKE).
 As stated in the other included file - this will never be as fast as remaking
 the SSE2 instructions from the ground up. But since I have no interest in
 doing so, myself - have at it if you want proper ARM support!
 For Raspberry Pi, note that this requires RPI2 or higher.
 *******************************************************************************/
 #if defined(__ARM_NEON__)
 #include "ila/sse_to_neon.hpp"
 // credit to Magnus Norddahl
 inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
    return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
 }
 #else
 // Just standard old Intel!
 #include <emmintrin.h>
 #endif
--- a/src/ila/sse_to_neon.hpp
+++ b/src/ila/sse_to_neon.hpp
@ -0,0 +1,187 @@
 //
 //  sse_to_neon.hpp
 //  neon_test
 //
 //  Created by Tim Oberhauser on 11/16/13.
 //  Copyright (c) 2013 Tim Oberhauser. All rights reserved.
 //
 #ifndef neon_test_sse_to_neon_hpp
 #define neon_test_sse_to_neon_hpp
 #include <arm_neon.h>
 #if defined(__MM_MALLOC_H)
 // copied from mm_malloc.h {
 #include <stdlib.h>
 /* We can't depend on <stdlib.h> since the prototype of posix_memalign
 may not be visible.  */
 #ifndef __cplusplus
 extern int posix_memalign (void **, size_t, size_t);
 #else
 extern "C" int posix_memalign (void **, size_t, size_t) throw ();
 #endif
 static __inline void *
 _mm_malloc (size_t size, size_t alignment)
 {
    void *ptr;
    if (alignment == 1)
        return malloc (size);
    if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4))
        alignment = sizeof (void *);
    if (posix_memalign (&ptr, alignment, size) == 0)
        return ptr;
    else
        return NULL;
 }
 static __inline void
 _mm_free (void * ptr)
 {
    free (ptr);
 }
 // } copied from mm_malloc.h
 #endif
 typedef int16x8_t __m128i;
 typedef float32x4_t __m128;
 // ADDITION
 inline __m128i _mm_add_epi16(const __m128i& a, const __m128i& b){
    return vaddq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
 }
 inline __m128 _mm_add_ps(const __m128& a, const __m128& b){
    return vaddq_f32(a,b);
 }
 // SUBTRACTION
 inline __m128i _mm_sub_epi16(const __m128i& a, const __m128i& b){
    return vsubq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
 }
 inline __m128 _mm_sub_ps(const __m128& a, const __m128& b){
    return vsubq_f32(a,b);
 }
 // MULTIPLICATION
 #if 0
 inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
    return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
 }
 #endif
 inline __m128 _mm_mul_ps(const __m128& a, const __m128& b){
    return vmulq_f32(a,b);
 }
 // SET VALUE
 inline __m128i _mm_set1_epi16(const int16_t w){
    return vmovq_n_s16(w);
 }
 inline __m128i _mm_setzero_si128(){
    return vmovq_n_s16(0);
 }
 inline __m128 _mm_set1_ps(const float32_t& w){
    return vmovq_n_f32(w);
 }
 // STORE
 inline void _mm_storeu_si128(__m128i* p, __m128i& a){
    vst1q_s16(reinterpret_cast<int16_t*>(p),reinterpret_cast<int16x8_t>(a));
 }
 inline void _mm_store_ps(float32_t* p, __m128&a){
    vst1q_f32(p,a);
 }
 // LOAD
 inline __m128i _mm_loadu_si128(__m128i* p){//For SSE address p does not need be 16-byte aligned
    return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
 }
 inline __m128i _mm_load_si128(__m128i* p){//For SSE address p must be 16-byte aligned
    return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
 }
 inline __m128 _mm_load_ps(const float32_t* p){
    return reinterpret_cast<__m128>(vld1q_f32(p));
 }
 // SHIFT OPERATIONS
 inline __m128i _mm_srai_epi16(const __m128i& a, const int count){
    int16x8_t b = vmovq_n_s16(-count);
    return reinterpret_cast<__m128i>(vshlq_s16(a,b));
    //    return vrshrq_n_s16(a, count);// TODO Argument to '__builtin_neon_vrshrq_n_v' must be a constant integer
 }
 // MIN/MAX OPERATIONS
 inline __m128 _mm_max_ps(const __m128& a, const __m128& b){
    return reinterpret_cast<__m128>(vmaxq_f32(reinterpret_cast<float32x4_t>(a),reinterpret_cast<float32x4_t>(b)));
 }
 // SINGLE ELEMENT ACCESS
 inline int16_t _mm_extract_epi16(__m128i& a, int index){
    return (reinterpret_cast<int16_t*>(&a))[index];
    //    return vgetq_lane_s16(a,index);// TODO Argument to '__builtin_neon_vgetq_lane_i16' must be a constant integer
 }
 // MISCELLANOUS
 inline __m128i _mm_sad_epu8 (__m128i a, __m128i b){
    uint64x2_t sad = reinterpret_cast<uint64x2_t>(vabdq_u8(reinterpret_cast<uint8x16_t>(a),reinterpret_cast<uint8x16_t>(b)));
    sad = reinterpret_cast<uint64x2_t>(vpaddlq_u8(reinterpret_cast<uint8x16_t>(sad)));
    sad = reinterpret_cast<uint64x2_t>(vpaddlq_u16(reinterpret_cast<uint16x8_t>(sad)));
    sad = vpaddlq_u32(reinterpret_cast<uint32x4_t>(sad));
    return reinterpret_cast<__m128i>(sad);
 }
 // LOGICAL OPERATIONS
 inline __m128 _mm_and_ps(__m128& a, __m128& b){
    return reinterpret_cast<__m128>(vandq_u32(reinterpret_cast<uint32x4_t>(a),reinterpret_cast<uint32x4_t>(b)));
 }
 // CONVERSIONS
 inline __m128i _mm_packus_epi16 (const __m128i a, const __m128i b){
    __m128i result = _mm_setzero_si128();
    int8x8_t* a_narrow = reinterpret_cast<int8x8_t*>(&result);
    int8x8_t* b_narrow = &a_narrow[1];
    *a_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(a));
    *b_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(b));
    return result;
 }
 // In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
 inline __m128i _mm_unpacklo_epi8(__m128i a, const __m128i dummy_zero){
    // dummy_zero is a dummy variable
    uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
    return reinterpret_cast<__m128i>(vmovl_u8(*a_low));
 }
 // In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
 inline __m128i _mm_unpackhi_epi8(__m128i a, const __m128i dummy_zero){
    // dummy_zero is a dummy variable
    uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
    return reinterpret_cast<__m128i>(vmovl_u8(a_low[1]));
 }
 #endif
--- a/src/swrenderer/drawers/r_draw_rgba.h
+++ b/src/swrenderer/drawers/r_draw_rgba.h
@ -36,6 +36,8 @@
 #ifndef NO_SSE
 #include <immintrin.h>
 #else
 #include "ila/ila.h"
 #endif
 struct FSpecialColormap;