44 #if defined(USE_SSE4_1) 48 __m128i sumI = _mm_setzero_si128();
49 __m128i sumQ = _mm_setzero_si128();
53 for (
int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16;
i++)
57 sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(even[0][a])), _MM_SHUFFLE(0,1,2,3));
58 sb = _mm_loadu_si128((__m128i*) &(even[0][b]));
59 sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
61 sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(even[1][a])), _MM_SHUFFLE(0,1,2,3));
62 sb = _mm_loadu_si128((__m128i*) &(even[1][b]));
63 sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
67 sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(odd[0][a])), _MM_SHUFFLE(0,1,2,3));
68 sb = _mm_loadu_si128((__m128i*) &(odd[0][b]));
69 sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
71 sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(odd[1][a])), _MM_SHUFFLE(0,1,2,3));
72 sb = _mm_loadu_si128((__m128i*) &(odd[1][b]));
73 sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
83 sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
84 sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
85 iAcc = _mm_cvtsi128_si32(sumI);
87 sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
88 sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
89 qAcc = _mm_cvtsi128_si32(sumQ);