/////////////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2021 Edouard Griffiths, F4EXB // // // // This program is free software; you can redistribute it and/or modify // // it under the terms of the GNU General Public License as published by // // the Free Software Foundation as version 3 of the License, or // // (at your option) any later version. // // // // This program is distributed in the hope that it will be useful, // // but WITHOUT ANY WARRANTY; without even the implied warranty of // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // // GNU General Public License V3 for more details. // // // // You should have received a copy of the GNU General Public License // // along with this program. If not, see . // /////////////////////////////////////////////////////////////////////////////////////// /* Intel AVX2 acceleration Copyright 2018 Ahmet Inan */ #ifndef AVX2_HH #define AVX2_HH #include #include "simd.h" namespace ldpctool { template <> union SIMD { static const int SIZE = 8; typedef float value_type; typedef uint32_t uint_type; __m256 m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 4; typedef double value_type; typedef uint64_t uint_type; __m256d m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 32; typedef int8_t value_type; typedef uint8_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 16; typedef int16_t value_type; typedef uint16_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 8; typedef int32_t value_type; typedef uint32_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 4; typedef int64_t value_type; typedef uint64_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 32; typedef uint8_t value_type; typedef uint8_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 16; typedef uint16_t value_type; typedef uint16_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 8; typedef uint32_t value_type; typedef uint32_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 4; typedef uint64_t value_type; typedef uint64_t uint_type; __m256i m; value_type v[SIZE]; uint_type u[SIZE]; }; template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256d)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; tmp.m = (__m256i)a.m; return tmp; } template <> inline SIMD vdup>(float a) { SIMD tmp; tmp.m = _mm256_set1_ps(a); return tmp; } template <> inline SIMD vdup>(double a) { SIMD tmp; tmp.m = _mm256_set1_pd(a); return tmp; } template <> inline SIMD vdup>(int8_t a) { SIMD tmp; tmp.m = _mm256_set1_epi8(a); return tmp; } template <> inline SIMD vdup>(int16_t a) { SIMD tmp; tmp.m = _mm256_set1_epi16(a); return tmp; } template <> inline SIMD vdup>(int32_t a) { SIMD tmp; tmp.m = _mm256_set1_epi32(a); return tmp; } template <> inline SIMD vdup>(int64_t a) { SIMD tmp; tmp.m = _mm256_set1_epi64x(a); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_ps(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_pd(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; tmp.m = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_ps(a.m, b.m); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_pd(a.m, b.m); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_epi8(a.m, b.m); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_epi16(a.m, b.m); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_epi32(a.m, b.m); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_add_epi64(a.m, b.m); return tmp; } template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_adds_epi8(a.m, b.m); return tmp; } template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_adds_epi16(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_ps(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_pd(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_epi8(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_epi16(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_epi32(a.m, b.m); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sub_epi64(a.m, b.m); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_subs_epi8(a.m, b.m); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_subs_epi16(a.m, b.m); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_subs_epu8(a.m, b.m); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_subs_epu16(a.m, b.m); return tmp; } template <> inline SIMD vabs(SIMD a) { SIMD tmp; tmp.m = _mm256_andnot_ps(_mm256_set1_ps(-0.f), a.m); return tmp; } template <> inline SIMD vabs(SIMD a) { SIMD tmp; tmp.m = _mm256_andnot_pd(_mm256_set1_pd(-0.), a.m); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm256_abs_epi8(_mm256_max_epi8(a.m, _mm256_set1_epi8(-INT8_MAX))); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm256_abs_epi16(_mm256_max_epi16(a.m, _mm256_set1_epi16(-INT16_MAX))); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; tmp.m = _mm256_abs_epi32(_mm256_max_epi32(a.m, _mm256_set1_epi32(-INT32_MAX))); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_ps( _mm256_cmp_ps(b.m, _mm256_setzero_ps(), _CMP_EQ_OQ), _mm256_xor_ps(a.m, _mm256_and_ps(_mm256_set1_ps(-0.f), b.m))); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_pd( _mm256_cmp_pd(b.m, _mm256_setzero_pd(), _CMP_EQ_OQ), _mm256_xor_pd(a.m, _mm256_and_pd(_mm256_set1_pd(-0.), b.m))); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sign_epi8(a.m, b.m); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sign_epi16(a.m, b.m); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_sign_epi32(a.m, b.m); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_or_si256(a.m, b.m); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_or_si256(a.m, b.m); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_or_si256(a.m, b.m); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_or_si256(a.m, b.m); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_and_si256(a.m, b.m); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_and_si256(a.m, b.m); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_and_si256(a.m, b.m); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_and_si256(a.m, b.m); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_xor_si256(a.m, b.m); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_xor_si256(a.m, b.m); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_xor_si256(a.m, b.m); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_xor_si256(a.m, b.m); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_si256(b.m, a.m); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_si256(b.m, a.m); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_si256(b.m, a.m); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_andnot_si256(b.m, a.m); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpeq_epi8(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpeq_epi16(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpeq_epi32(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpeq_epi64(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_pd(a.m, b.m, _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_cmpeq_epi8(a.m, b.m); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_cmpeq_epi16(a.m, b.m); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_cmpeq_epi32(a.m, b.m); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_cmpeq_epi64(a.m, b.m); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_GT_OQ); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_GT_OQ); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi8(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi16(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi32(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi64(a.m, _mm256_setzero_si256()); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_LT_OQ); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_LT_OQ); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi8(_mm256_setzero_si256(), a.m); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi16(_mm256_setzero_si256(), a.m); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi32(_mm256_setzero_si256(), a.m); return tmp; } template <> inline SIMD vcltz(SIMD a) { SIMD tmp; tmp.m = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a.m); return tmp; } template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_min_ps(a.m, b.m); return tmp; } template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_min_pd(a.m, b.m); return tmp; } template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_min_epi8(a.m, b.m); return tmp; } template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_min_epi16(a.m, b.m); return tmp; } template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_min_epi32(a.m, b.m); return tmp; } template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_max_ps(a.m, b.m); return tmp; } template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_max_pd(a.m, b.m); return tmp; } template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_max_epi8(a.m, b.m); return tmp; } template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_max_epi16(a.m, b.m); return tmp; } template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; tmp.m = _mm256_max_epi32(a.m, b.m); return tmp; } } // namespace ldpctool #endif