- started adding ARM support. incomplete. won't compile. don't try.

2025-04-13 11:50:58 +00:00 · 2017-03-04 16:50:42 -05:00 · 2017-03-04 16:50:42 -05:00 · be8abba344
commit be8abba344
parent 09530e9496
5 changed files with 1448 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -502,6 +502,11 @@ if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE )
 		set( CMAKE_EXE_LINKER_FLAGS "-stdlib=libc++ ${CMAKE_EXE_LINKER_FLAGS}" )
 	endif ()

+	# ARM processors (Raspberry Pi) - enable ARM NEON support.
+	if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") 
+		set( CMAKE_CXX_FLAGS "-mfpu=neon ${CMAKE_CXX_FLAGS}" )
+	endif ()
+
 	# Remove extra warnings when using the official DirectX headers.
 	# Also, TDM-GCC 4.4.0 no longer accepts glibc-style printf formats as valid,
 	# which is a royal pain. The previous version I had been using was fine with them.
--- a/src/ila/SSE2NEON.h
+++ b/src/ila/SSE2NEON.h
--- a/src/ila/ila.h
+++ b/src/ila/ila.h
@ -0,0 +1,56 @@
+/*******************************************************************************
+Copyright (c) 2017 Rachael Alexanderson
+
+Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+================================================================================
+
+Simple In-Line Assembly support for ARM
+
+The purpose of this file is to provide SSE2 translations to ARM NEON. Note
+ that this requires a special GCC flag (which should automatically be set by
+ CMAKE).
+
+As stated in the other included file - this will never be as fast as remaking
+ the SSE2 instructions from the ground up. But since I have no interest in
+ doing so, myself - have at it if you want proper ARM support!
+
+For Raspberry Pi, note that this requires RPI2 or higher.
+
+*******************************************************************************/
+
+#if defined(__ARM_NEON__)
+#include "ila/sse_to_neon.hpp"
+// credit to Magnus Norddahl
+inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
+    return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
+}
+#else
+// Just standard old Intel!
+#include <emmintrin.h>
+#endif
+
--- a/src/ila/sse_to_neon.hpp
+++ b/src/ila/sse_to_neon.hpp
@ -0,0 +1,187 @@
+//
+//  sse_to_neon.hpp
+//  neon_test
+//
+//  Created by Tim Oberhauser on 11/16/13.
+//  Copyright (c) 2013 Tim Oberhauser. All rights reserved.
+//
+
+#ifndef neon_test_sse_to_neon_hpp
+#define neon_test_sse_to_neon_hpp
+
+#include <arm_neon.h>
+
+#if defined(__MM_MALLOC_H)
+// copied from mm_malloc.h {
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+ may not be visible.  */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+    void *ptr;
+    if (alignment == 1)
+        return malloc (size);
+    if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4))
+        alignment = sizeof (void *);
+    if (posix_memalign (&ptr, alignment, size) == 0)
+        return ptr;
+    else
+        return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+    free (ptr);
+}
+// } copied from mm_malloc.h
+#endif
+
+
+typedef int16x8_t __m128i;
+typedef float32x4_t __m128;
+
+
+// ADDITION
+inline __m128i _mm_add_epi16(const __m128i& a, const __m128i& b){
+    return vaddq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
+}
+
+inline __m128 _mm_add_ps(const __m128& a, const __m128& b){
+    return vaddq_f32(a,b);
+}
+
+
+// SUBTRACTION
+inline __m128i _mm_sub_epi16(const __m128i& a, const __m128i& b){
+    return vsubq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
+}
+
+inline __m128 _mm_sub_ps(const __m128& a, const __m128& b){
+    return vsubq_f32(a,b);
+}
+
+
+// MULTIPLICATION
+#if 0
+inline __m128i _mm_mullo_epi16(const __m128i& a, const __m128i& b){
+    return vqrdmulhq_s16(reinterpret_cast<int16x8_t>(a),reinterpret_cast<int16x8_t>(b));
+}
+#endif
+
+inline __m128 _mm_mul_ps(const __m128& a, const __m128& b){
+    return vmulq_f32(a,b);
+}
+
+
+// SET VALUE
+inline __m128i _mm_set1_epi16(const int16_t w){
+    return vmovq_n_s16(w);
+}
+
+inline __m128i _mm_setzero_si128(){
+    return vmovq_n_s16(0);
+}
+
+inline __m128 _mm_set1_ps(const float32_t& w){
+    return vmovq_n_f32(w);
+}
+
+
+// STORE
+inline void _mm_storeu_si128(__m128i* p, __m128i& a){
+    vst1q_s16(reinterpret_cast<int16_t*>(p),reinterpret_cast<int16x8_t>(a));
+}
+
+inline void _mm_store_ps(float32_t* p, __m128&a){
+    vst1q_f32(p,a);
+}
+
+
+// LOAD
+inline __m128i _mm_loadu_si128(__m128i* p){//For SSE address p does not need be 16-byte aligned
+    return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
+}
+
+inline __m128i _mm_load_si128(__m128i* p){//For SSE address p must be 16-byte aligned
+    return reinterpret_cast<__m128i>(vld1q_s16(reinterpret_cast<int16_t*>(p)));
+}
+
+inline __m128 _mm_load_ps(const float32_t* p){
+    return reinterpret_cast<__m128>(vld1q_f32(p));
+}
+
+
+// SHIFT OPERATIONS
+inline __m128i _mm_srai_epi16(const __m128i& a, const int count){
+    int16x8_t b = vmovq_n_s16(-count);
+    return reinterpret_cast<__m128i>(vshlq_s16(a,b));
+    //    return vrshrq_n_s16(a, count);// TODO Argument to '__builtin_neon_vrshrq_n_v' must be a constant integer
+}
+
+
+// MIN/MAX OPERATIONS
+inline __m128 _mm_max_ps(const __m128& a, const __m128& b){
+    return reinterpret_cast<__m128>(vmaxq_f32(reinterpret_cast<float32x4_t>(a),reinterpret_cast<float32x4_t>(b)));
+}
+
+
+// SINGLE ELEMENT ACCESS
+inline int16_t _mm_extract_epi16(__m128i& a, int index){
+    return (reinterpret_cast<int16_t*>(&a))[index];
+    //    return vgetq_lane_s16(a,index);// TODO Argument to '__builtin_neon_vgetq_lane_i16' must be a constant integer
+}
+
+
+// MISCELLANOUS
+inline __m128i _mm_sad_epu8 (__m128i a, __m128i b){
+    uint64x2_t sad = reinterpret_cast<uint64x2_t>(vabdq_u8(reinterpret_cast<uint8x16_t>(a),reinterpret_cast<uint8x16_t>(b)));
+    sad = reinterpret_cast<uint64x2_t>(vpaddlq_u8(reinterpret_cast<uint8x16_t>(sad)));
+    sad = reinterpret_cast<uint64x2_t>(vpaddlq_u16(reinterpret_cast<uint16x8_t>(sad)));
+    sad = vpaddlq_u32(reinterpret_cast<uint32x4_t>(sad));
+    return reinterpret_cast<__m128i>(sad);
+}
+
+
+// LOGICAL OPERATIONS
+inline __m128 _mm_and_ps(__m128& a, __m128& b){
+    return reinterpret_cast<__m128>(vandq_u32(reinterpret_cast<uint32x4_t>(a),reinterpret_cast<uint32x4_t>(b)));
+}
+
+
+// CONVERSIONS
+inline __m128i _mm_packus_epi16 (const __m128i a, const __m128i b){
+    __m128i result = _mm_setzero_si128();
+    int8x8_t* a_narrow = reinterpret_cast<int8x8_t*>(&result);
+    int8x8_t* b_narrow = &a_narrow[1];
+    *a_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(a));
+    *b_narrow = reinterpret_cast<int8x8_t>(vqmovun_s16(b));
+    return result;
+}
+
+// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
+inline __m128i _mm_unpacklo_epi8(__m128i a, const __m128i dummy_zero){
+    // dummy_zero is a dummy variable
+    uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
+    return reinterpret_cast<__m128i>(vmovl_u8(*a_low));
+}
+
+// In my case this function was only needed to convert 8 bit to 16 bit integers by extending with zeros, the general case is not implemented!!!
+inline __m128i _mm_unpackhi_epi8(__m128i a, const __m128i dummy_zero){
+    // dummy_zero is a dummy variable
+    uint8x8_t* a_low = reinterpret_cast<uint8x8_t*>(&a);
+    return reinterpret_cast<__m128i>(vmovl_u8(a_low[1]));
+}
+
+
+
+
+#endif
--- a/src/swrenderer/drawers/r_draw_rgba.h
+++ b/src/swrenderer/drawers/r_draw_rgba.h
@ -36,6 +36,8 @@

 #ifndef NO_SSE
 #include <immintrin.h>
+#else
+#include "ila/ila.h"
 #endif

 struct FSpecialColormap;