/*
Intel SSE4.1 acceleration

Copyright 2018 Ahmet Inan
*/

#ifndef SSE4_1_HH
#define SSE4_1_HH

// NOTE(review): the two bare #include directives below carry no header name in
// this view. Presumably two <...> system headers (an intrinsics header and a
// fixed-width integer header) were lost -- TODO confirm against the real file.
#include
#include
#include "simd.h"

namespace ldpctool {

// ---------------------------------------------------------------------------
// SIMD union specializations
//
// Each specialization overlays one native SSE register (m) with a per-lane
// array of the element type (v) and a per-lane array of an unsigned integer
// type of the same width (u). SIZE is the number of lanes.
//
// NOTE(review): every specialization in this view is spelled plain `SIMD`
// with no template argument list, so the <type, width> pair each one stands
// for can only be read off value_type and SIZE. Confirm against the primary
// template declared in simd.h.
// ---------------------------------------------------------------------------

// 4 x float in an __m128
template <> union SIMD {
	static const int SIZE = 4;
	typedef float value_type;
	typedef uint32_t uint_type;
	__m128 m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 2 x double in an __m128d
template <> union SIMD {
	static const int SIZE = 2;
	typedef double value_type;
	typedef uint64_t uint_type;
	__m128d m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 16 x int8_t in an __m128i
template <> union SIMD {
	static const int SIZE = 16;
	typedef int8_t value_type;
	typedef uint8_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 8 x int16_t in an __m128i
template <> union SIMD {
	static const int SIZE = 8;
	typedef int16_t value_type;
	typedef uint16_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 4 x int32_t in an __m128i
template <> union SIMD {
	static const int SIZE = 4;
	typedef int32_t value_type;
	typedef uint32_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 2 x int64_t in an __m128i
template <> union SIMD {
	static const int SIZE = 2;
	typedef int64_t value_type;
	typedef uint64_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 16 x uint8_t in an __m128i
template <> union SIMD {
	static const int SIZE = 16;
	typedef uint8_t value_type;
	typedef uint8_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 8 x uint16_t in an __m128i
template <> union SIMD {
	static const int SIZE = 8;
	typedef uint16_t value_type;
	typedef uint16_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 4 x uint32_t in an __m128i
template <> union SIMD {
	static const int SIZE = 4;
	typedef uint32_t value_type;
	typedef uint32_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// 2 x uint64_t in an __m128i
template <> union SIMD {
	static const int SIZE = 2;
	typedef uint64_t value_type;
	typedef uint64_t uint_type;
	__m128i m;
	value_type v[SIZE];
	uint_type u[SIZE];
};

// ---------------------------------------------------------------------------
// vreinterpret: return a SIMD value whose register is the argument's register
// converted with a C-style cast to __m128, __m128d or __m128i.
//
// NOTE(review): source and destination SIMD types are not visible in this
// view. The first four presumably pair the float/double types with their
// same-width unsigned types, the remaining eight presumably map between
// signed and unsigned integer types -- TODO confirm.
// NOTE(review): C-style casts between __m128, __m128d and __m128i rely on
// compiler vector extensions (GCC/Clang accept them); the _mm_cast* intrinsics
// are the portable spelling if other compilers matter.
// ---------------------------------------------------------------------------
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128d)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }
template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m128i)a.m; return tmp; }

// ---------------------------------------------------------------------------
// vdup: broadcast the scalar argument into every lane (_mm_set1_*).
//
// NOTE(review): the stray `>` after `vdup` looks like the tail of a lost
// explicit template argument list -- TODO confirm.
// ---------------------------------------------------------------------------
template <> inline SIMD vdup>(float a) { SIMD tmp; tmp.m = _mm_set1_ps(a); return tmp; }
template <> inline SIMD vdup>(double a) { SIMD tmp; tmp.m = _mm_set1_pd(a); return tmp; }
template <> inline SIMD vdup>(int8_t a) { SIMD tmp; tmp.m = _mm_set1_epi8(a); return tmp; }
template <> inline SIMD vdup>(int16_t a) { SIMD tmp; tmp.m = _mm_set1_epi16(a); return tmp; }
template <> inline SIMD vdup>(int32_t a) { SIMD tmp; tmp.m = _mm_set1_epi32(a); return tmp; }
template <> inline SIMD vdup>(int64_t a) { SIMD tmp; tmp.m = _mm_set1_epi64x(a); return tmp; }

// ---------------------------------------------------------------------------
// vzero: return a value whose register is all zero bits (_mm_setzero_*).
// ---------------------------------------------------------------------------
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_ps(); return tmp; }
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_pd(); return tmp; }
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_si128(); return tmp; }
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_si128(); return tmp; }
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_si128(); return tmp; }
template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm_setzero_si128(); return tmp; }

// ---------------------------------------------------------------------------
// vadd: lane-wise a + b (_mm_add_*), for float, double and 8/16/32/64-bit
// integer lanes.
// ---------------------------------------------------------------------------
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_ps(a.m, b.m); return tmp; }
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_pd(a.m, b.m); return tmp; }
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_epi32(a.m, b.m); return tmp; }
template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_add_epi64(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vqadd: lane-wise saturating a + b on signed 8- and 16-bit lanes
// (_mm_adds_epi8 / _mm_adds_epi16).
// ---------------------------------------------------------------------------
template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_adds_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_adds_epi16(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vsub: lane-wise a - b (_mm_sub_*), for float, double and 8/16/32/64-bit
// integer lanes.
// ---------------------------------------------------------------------------
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_ps(a.m, b.m); return tmp; }
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_pd(a.m, b.m); return tmp; }
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_epi32(a.m, b.m); return tmp; }
template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sub_epi64(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vqsub: lane-wise saturating a - b. The first two use the signed forms
// (_mm_subs_epi8/16), the last two the unsigned forms (_mm_subs_epu8/16).
// ---------------------------------------------------------------------------
template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_subs_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_subs_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_subs_epu8(a.m, b.m); return tmp; }
template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_subs_epu16(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vabs: floating-point absolute value. -0.0 has only the sign bit set, so
// and-not with that mask clears the sign bit of every lane.
// ---------------------------------------------------------------------------
template <> inline SIMD vabs(SIMD a) { SIMD tmp; tmp.m = _mm_andnot_ps(_mm_set1_ps(-0.f), a.m); return tmp; }
template <> inline SIMD vabs(SIMD a) { SIMD tmp; tmp.m = _mm_andnot_pd(_mm_set1_pd(-0.), a.m); return tmp; }

// ---------------------------------------------------------------------------
// vqabs: integer absolute value with the input first clamped from below to
// -INTn_MAX, so the most negative value maps to INTn_MAX instead of
// overflowing in the abs step.
// ---------------------------------------------------------------------------
template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm_abs_epi8(_mm_max_epi8(a.m, _mm_set1_epi8(-INT8_MAX))); return tmp; }
template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm_abs_epi16(_mm_max_epi16(a.m, _mm_set1_epi16(-INT16_MAX))); return tmp; }
template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm_abs_epi32(_mm_max_epi32(a.m, _mm_set1_epi32(-INT32_MAX))); return tmp; }

// ---------------------------------------------------------------------------
// vsign: apply the sign of b to a, lane by lane.
//
// Floating point: a's sign bit is flipped where b's sign bit is set (xor with
// b's isolated sign bit), then lanes where b compares equal to zero are
// cleared to all-zero bits by the and-not with the comparison mask.
// Integer: delegated to _mm_sign_epi8/16/32.
// ---------------------------------------------------------------------------
template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_ps(
	_mm_cmpeq_ps(b.m, _mm_setzero_ps()),
	_mm_xor_ps(a.m, _mm_and_ps(_mm_set1_ps(-0.f), b.m))); return tmp; }
template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_pd(
	_mm_cmpeq_pd(b.m, _mm_setzero_pd()),
	_mm_xor_pd(a.m, _mm_and_pd(_mm_set1_pd(-0.), b.m))); return tmp; }
template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sign_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sign_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_sign_epi32(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vorr: bitwise a | b over the whole 128-bit register.
// ---------------------------------------------------------------------------
template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_or_si128(a.m, b.m); return tmp; }
template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_or_si128(a.m, b.m); return tmp; }
template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_or_si128(a.m, b.m); return tmp; }
template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_or_si128(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vand: bitwise a & b over the whole 128-bit register.
// ---------------------------------------------------------------------------
template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_and_si128(a.m, b.m); return tmp; }
template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_and_si128(a.m, b.m); return tmp; }
template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_and_si128(a.m, b.m); return tmp; }
template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_and_si128(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// veor: bitwise a ^ b over the whole 128-bit register.
// ---------------------------------------------------------------------------
template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_xor_si128(a.m, b.m); return tmp; }
template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_xor_si128(a.m, b.m); return tmp; }
template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_xor_si128(a.m, b.m); return tmp; }
template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_xor_si128(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vbic: bit clear, a & ~b. Note the swapped operands: _mm_andnot_si128
// complements its FIRST argument.
// ---------------------------------------------------------------------------
template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_si128(b.m, a.m); return tmp; }
template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_si128(b.m, a.m); return tmp; }
template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_si128(b.m, a.m); return tmp; }
template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_andnot_si128(b.m, a.m); return tmp; }

// ---------------------------------------------------------------------------
// vbsl: bitwise select with a as the mask: (a & b) | (~a & c). Bits set in a
// take the corresponding bit of b, cleared bits take the bit of c.
// ---------------------------------------------------------------------------
template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); return tmp; }
template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); return tmp; }
template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); return tmp; }
template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); return tmp; }

// ---------------------------------------------------------------------------
// vceqz: lane-wise a == 0. The compare intrinsics produce an all-ones lane
// where the test holds and an all-zero lane otherwise; the float/double
// results are cast to __m128i so the mask lands in an integer register.
// ---------------------------------------------------------------------------
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmpeq_ps(a.m, _mm_setzero_ps()); return tmp; }
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmpeq_pd(a.m, _mm_setzero_pd()); return tmp; }
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpeq_epi8(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpeq_epi16(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpeq_epi32(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpeq_epi64(a.m, _mm_setzero_si128()); return tmp; }

// ---------------------------------------------------------------------------
// vceq: lane-wise a == b, returned as a lane mask in an integer register
// (float/double compare results are cast to __m128i).
// ---------------------------------------------------------------------------
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = (__m128i)_mm_cmpeq_ps(a.m, b.m); return tmp; }
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = (__m128i)_mm_cmpeq_pd(a.m, b.m); return tmp; }
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_cmpeq_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_cmpeq_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_cmpeq_epi32(a.m, b.m); return tmp; }
template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_cmpeq_epi64(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vcgtz: lane-wise a > 0, returned as a lane mask in an integer register.
//
// NOTE(review): _mm_cmpgt_epi64 (used by the 64-bit variant here and in vcltz)
// is listed by Intel as an SSE4.2 intrinsic, while this header is labelled
// SSE4.1 -- confirm the build enables SSE4.2 where these are instantiated.
// ---------------------------------------------------------------------------
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmpgt_ps(a.m, _mm_setzero_ps()); return tmp; }
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmpgt_pd(a.m, _mm_setzero_pd()); return tmp; }
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi8(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi16(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi32(a.m, _mm_setzero_si128()); return tmp; }
template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi64(a.m, _mm_setzero_si128()); return tmp; }

// ---------------------------------------------------------------------------
// vcltz: lane-wise a < 0, returned as a lane mask in an integer register.
// The integer variants express it as 0 > a by swapping the operands of the
// greater-than compare.
// ---------------------------------------------------------------------------
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmplt_ps(a.m, _mm_setzero_ps()); return tmp; }
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = (__m128i)_mm_cmplt_pd(a.m, _mm_setzero_pd()); return tmp; }
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi8(_mm_setzero_si128(), a.m); return tmp; }
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi16(_mm_setzero_si128(), a.m); return tmp; }
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi32(_mm_setzero_si128(), a.m); return tmp; }
template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm_cmpgt_epi64(_mm_setzero_si128(), a.m); return tmp; }

// ---------------------------------------------------------------------------
// vmin: lane-wise minimum of a and b (_mm_min_*), for float, double and
// signed 8/16/32-bit integer lanes.
// ---------------------------------------------------------------------------
template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_min_ps(a.m, b.m); return tmp; }
template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_min_pd(a.m, b.m); return tmp; }
template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_min_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_min_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_min_epi32(a.m, b.m); return tmp; }

// ---------------------------------------------------------------------------
// vmax: lane-wise maximum of a and b (_mm_max_*), for float, double and
// signed 8/16/32-bit integer lanes.
// ---------------------------------------------------------------------------
template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_max_ps(a.m, b.m); return tmp; }
template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_max_pd(a.m, b.m); return tmp; }
template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_max_epi8(a.m, b.m); return tmp; }
template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_max_epi16(a.m, b.m); return tmp; }
template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm_max_epi32(a.m, b.m); return tmp; }

// NOTE(review): in this view the include guard's closing "#endif" is part of
// the line comment below and is therefore inactive; it presumably belongs on
// a line of its own -- TODO confirm against the real file.
} // namespace ldpctool #endif