23 #ifndef SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ 24 #define SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ 28 #if defined(USE_SSE4_1) 29 #include <smmintrin.h> 34 template<u
int32_t HBFilterOrder>
39 int32_t samples[HBFilterOrder][2],
43 #if defined(USE_SSE4_1) 47 __m128i sum = _mm_setzero_si128();
51 for (
int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16;
i++)
53 shh = _mm_set_epi32(h[4*
i], h[4*i], h[4*i], h[4*i]);
54 sa = _mm_load_si128((__m128i*) &(samples[a][0]));
55 sb = _mm_load_si128((__m128i*) &(samples[b][0]));
56 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
59 shh = _mm_set_epi32(h[4*i+1], h[4*i+1], h[4*i+1], h[4*i+1]);
60 sa = _mm_load_si128((__m128i*) &(samples[a][0]));
61 sb = _mm_load_si128((__m128i*) &(samples[b][0]));
62 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
65 shh = _mm_set_epi32(h[4*i+2], h[4*i+2], h[4*i+2], h[4*i+2]);
66 sa = _mm_load_si128((__m128i*) &(samples[a][0]));
67 sb = _mm_load_si128((__m128i*) &(samples[b][0]));
68 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
71 shh = _mm_set_epi32(h[4*i+3], h[4*i+3], h[4*i+3], h[4*i+3]);
72 sa = _mm_load_si128((__m128i*) &(samples[a][0]));
73 sb = _mm_load_si128((__m128i*) &(samples[b][0]));
74 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
80 _mm_store_si128((__m128i*) sums, sum);
91 int32_t samples[HBFilterOrder*2][2],
95 #if defined(USE_SSE4_1) 99 __m128i sum = _mm_setzero_si128();
103 for (
int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16;
i++)
105 shh = _mm_set_epi32(h[4*
i], h[4*i], h[4*i], h[4*i]);
106 sa = _mm_loadu_si128((__m128i*) &(samples[a][0]));
107 sb = _mm_loadu_si128((__m128i*) &(samples[b][0]));
108 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
111 shh = _mm_set_epi32(h[4*i+1], h[4*i+1], h[4*i+1], h[4*i+1]);
112 sa = _mm_loadu_si128((__m128i*) &(samples[a][0]));
113 sb = _mm_loadu_si128((__m128i*) &(samples[b][0]));
114 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
117 shh = _mm_set_epi32(h[4*i+2], h[4*i+2], h[4*i+2], h[4*i+2]);
118 sa = _mm_loadu_si128((__m128i*) &(samples[a][0]));
119 sb = _mm_loadu_si128((__m128i*) &(samples[b][0]));
120 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
123 shh = _mm_set_epi32(h[4*i+3], h[4*i+3], h[4*i+3], h[4*i+3]);
124 sa = _mm_loadu_si128((__m128i*) &(samples[a][0]));
125 sb = _mm_loadu_si128((__m128i*) &(samples[b][0]));
126 sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh));
132 _mm_store_si128((__m128i*) sums, sum);
static void workNA(int ptr, int32_t samples[HBFilterOrder *2][2], int32_t &iEvenAcc, int32_t &qEvenAcc, int32_t &iOddAcc, int32_t &qOddAcc)
static void work(int32_t samples[HBFilterOrder][2], int32_t &iEvenAcc, int32_t &qEvenAcc, int32_t &iOddAcc, int32_t &qOddAcc)