00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
#include <xmmintrin.h>
00034
/*
 * Tenth-order pole-zero filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] + x[i]*_num[k+1] - y[i]*_den[k+1],  k = 0..9,
 * where state[10] is taken as zero.  _num[0] and _den[0] are never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..10 are read
 * _den  denominator coefficients; elements 1..10 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses ten taps
 * _mem  filter state (10 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void filter_mem2_10(const float *x, const float *_num, const float *_den,
                    float *y, int N, int ord, float *_mem)
{
   /* Ten coefficients / state values are packed as 4 + 4 + 2 lanes; the two
      spare lanes of each "hi" register start out as zero. */
   const __m128 b_lo  = _mm_loadu_ps(_num + 1);
   const __m128 b_mid = _mm_loadu_ps(_num + 5);
   const __m128 b_hi  = _mm_setr_ps(_num[9], _num[10], 0, 0);
   const __m128 a_lo  = _mm_loadu_ps(_den + 1);
   const __m128 a_mid = _mm_loadu_ps(_den + 5);
   const __m128 a_hi  = _mm_setr_ps(_den[9], _den[10], 0, 0);
   __m128 s_lo  = _mm_loadu_ps(_mem);
   __m128 s_mid = _mm_loadu_ps(_mem + 4);
   __m128 s_hi  = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n;

   (void)ord;

   for (n = 0; n < N; n++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 in = _mm_set1_ps(x[n]);
      /* Lane 0 holds the output sample x[n] + state[0]. */
      __m128 out = _mm_add_ss(in, s_lo);

      _mm_store_ss(y + n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0, 0, 0, 0));

      /* Pull the first lane of the next register into lane 0, then rotate
         down by one lane: (s1, s2, s3, next0). */
      s_lo = _mm_move_ss(s_lo, s_mid);
      s_lo = _mm_shuffle_ps(s_lo, s_lo, _MM_SHUFFLE(0, 3, 2, 1));
      s_lo = _mm_sub_ps(_mm_add_ps(s_lo, _mm_mul_ps(in, b_lo)),
                        _mm_mul_ps(out, a_lo));

      s_mid = _mm_move_ss(s_mid, s_hi);
      s_mid = _mm_shuffle_ps(s_mid, s_mid, _MM_SHUFFLE(0, 3, 2, 1));
      s_mid = _mm_sub_ps(_mm_add_ps(s_mid, _mm_mul_ps(in, b_mid)),
                         _mm_mul_ps(out, a_mid));

      /* Lane 1 moves to lane 0; the other lanes are refilled from lane 3. */
      s_hi = _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(3, 3, 3, 1));
      s_hi = _mm_sub_ps(_mm_add_ps(s_hi, _mm_mul_ps(in, b_hi)),
                        _mm_mul_ps(out, a_hi));
   }

   /* Write the ten state values back: 4 + 4 + two single-lane stores. */
   _mm_storeu_ps(_mem, s_lo);
   _mm_storeu_ps(_mem + 4, s_mid);
   _mm_store_ss(_mem + 8, s_hi);
   _mm_store_ss(_mem + 9, _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(1, 1, 1, 1)));
}
00087
/*
 * Eighth-order pole-zero filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] + x[i]*_num[k+1] - y[i]*_den[k+1],  k = 0..7,
 * where the value shifted into the top slot is state[4] - state[4].
 * _num[0] and _den[0] are never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..8 are read
 * _den  denominator coefficients; elements 1..8 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses eight taps
 * _mem  filter state (8 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void filter_mem2_8(const float *x, const float *_num, const float *_den,
                   float *y, int N, int ord, float *_mem)
{
   /* Eight coefficients / state values fit exactly into two registers. */
   const __m128 b_lo = _mm_loadu_ps(_num + 1);
   const __m128 b_hi = _mm_loadu_ps(_num + 5);
   const __m128 a_lo = _mm_loadu_ps(_den + 1);
   const __m128 a_hi = _mm_loadu_ps(_den + 5);
   __m128 s_lo = _mm_loadu_ps(_mem);
   __m128 s_hi = _mm_loadu_ps(_mem + 4);
   int n;

   (void)ord;

   for (n = 0; n < N; n++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 in = _mm_set1_ps(x[n]);
      /* Lane 0 holds the output sample x[n] + state[0]. */
      __m128 out = _mm_add_ss(in, s_lo);

      _mm_store_ss(y + n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0, 0, 0, 0));

      /* Pull the first lane of the upper register into lane 0, then rotate
         down by one lane: (s1, s2, s3, s4). */
      s_lo = _mm_move_ss(s_lo, s_hi);
      s_lo = _mm_shuffle_ps(s_lo, s_lo, _MM_SHUFFLE(0, 3, 2, 1));
      s_lo = _mm_sub_ps(_mm_add_ps(s_lo, _mm_mul_ps(in, b_lo)),
                        _mm_mul_ps(out, a_lo));

      /* Clear lane 0 by subtracting it from itself, then rotate it round to
         become the new top lane: (s5, s6, s7, s4 - s4). */
      s_hi = _mm_sub_ss(s_hi, s_hi);
      s_hi = _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(0, 3, 2, 1));
      s_hi = _mm_sub_ps(_mm_add_ps(s_hi, _mm_mul_ps(in, b_hi)),
                        _mm_mul_ps(out, a_hi));
   }

   _mm_storeu_ps(_mem, s_lo);
   _mm_storeu_ps(_mem + 4, s_hi);
}
00129
00130
00131
/*
 * Pole-zero filter entry point: y[i] = x[i] + state[0], followed by
 *     state[k] = state[k+1] + x[i]*_num[k+1] - y[i]*_den[k+1],  k = 0..ord-1,
 * with state[ord] taken as zero.  _num[0] and _den[0] are never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..ord are read
 * _den  denominator coefficients; elements 1..ord are read
 * y     output samples (N floats)
 * N     number of samples to process
 * ord   filter order
 * _mem  filter state (ord floats), updated in place
 *
 * Orders 10 and 8 are handed to the dedicated SSE kernels.  Every other
 * positive order runs through a scalar loop applying the same recurrence, so
 * that y is always produced.  A non-positive order does nothing.
 */
void filter_mem2(const float *x, const float *_num, const float *_den,
                 float *y, int N, int ord, float *_mem)
{
   if (ord == 10)
   {
      filter_mem2_10(x, _num, _den, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      filter_mem2_8(x, _num, _den, y, N, ord, _mem);
   }
   else if (ord > 0)
   {
      int i, k;

      for (i = 0; i < N; i++)
      {
         /* Read the input before writing the output so x and y may alias. */
         const float xi = x[i];
         const float yi = xi + _mem[0];

         y[i] = yi;
         for (k = 0; k + 1 < ord; k++)
            _mem[k] = (_mem[k + 1] + xi * _num[k + 1]) - yi * _den[k + 1];
         /* Nothing is shifted into the top slot. */
         _mem[ord - 1] = xi * _num[ord] - yi * _den[ord];
      }
   }
}
00139
00140
00141
/*
 * Tenth-order all-pole filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] - y[i]*_den[k+1],  k = 0..9,
 * where state[10] is taken as zero.  _den[0] is never read.
 *
 * x     input samples (N floats)
 * _den  denominator coefficients; elements 1..10 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses ten taps
 * _mem  filter state (10 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void iir_mem2_10(const float *x, const float *_den, float *y, int N, int ord,
                 float *_mem)
{
   /* Ten coefficients / state values are packed as 4 + 4 + 2 lanes; the two
      spare lanes of each "hi" register start out as zero. */
   const __m128 a_lo  = _mm_loadu_ps(_den + 1);
   const __m128 a_mid = _mm_loadu_ps(_den + 5);
   const __m128 a_hi  = _mm_setr_ps(_den[9], _den[10], 0, 0);
   __m128 s_lo  = _mm_loadu_ps(_mem);
   __m128 s_mid = _mm_loadu_ps(_mem + 4);
   __m128 s_hi  = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   int n;

   (void)ord;

   for (n = 0; n < N; n++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 in = _mm_set1_ps(x[n]);
      /* Lane 0 holds the output sample x[n] + state[0]. */
      __m128 out = _mm_add_ss(in, s_lo);

      _mm_store_ss(y + n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0, 0, 0, 0));

      /* Pull the first lane of the next register into lane 0, then rotate
         down by one lane: (s1, s2, s3, next0). */
      s_lo = _mm_move_ss(s_lo, s_mid);
      s_lo = _mm_shuffle_ps(s_lo, s_lo, _MM_SHUFFLE(0, 3, 2, 1));
      s_lo = _mm_sub_ps(s_lo, _mm_mul_ps(out, a_lo));

      s_mid = _mm_move_ss(s_mid, s_hi);
      s_mid = _mm_shuffle_ps(s_mid, s_mid, _MM_SHUFFLE(0, 3, 2, 1));
      s_mid = _mm_sub_ps(s_mid, _mm_mul_ps(out, a_mid));

      /* Lane 1 moves to lane 0; the other lanes are refilled from lane 3. */
      s_hi = _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(3, 3, 3, 1));
      s_hi = _mm_sub_ps(s_hi, _mm_mul_ps(out, a_hi));
   }

   /* Write the ten state values back: 4 + 4 + two single-lane stores. */
   _mm_storeu_ps(_mem, s_lo);
   _mm_storeu_ps(_mem + 4, s_mid);
   _mm_store_ss(_mem + 8, s_hi);
   _mm_store_ss(_mem + 9, _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(1, 1, 1, 1)));
}
00189
00190
/*
 * Eighth-order all-pole filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] - y[i]*_den[k+1],  k = 0..7,
 * where the value shifted into the top slot is state[4] - state[4].
 * _den[0] is never read.
 *
 * x     input samples (N floats)
 * _den  denominator coefficients; elements 1..8 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses eight taps
 * _mem  filter state (8 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void iir_mem2_8(const float *x, const float *_den, float *y, int N, int ord,
                float *_mem)
{
   /* Eight coefficients / state values fit exactly into two registers. */
   const __m128 a_lo = _mm_loadu_ps(_den + 1);
   const __m128 a_hi = _mm_loadu_ps(_den + 5);
   __m128 s_lo = _mm_loadu_ps(_mem);
   __m128 s_hi = _mm_loadu_ps(_mem + 4);
   int n;

   (void)ord;

   for (n = 0; n < N; n++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 in = _mm_set1_ps(x[n]);
      /* Lane 0 holds the output sample x[n] + state[0]. */
      __m128 out = _mm_add_ss(in, s_lo);

      _mm_store_ss(y + n, out);
      out = _mm_shuffle_ps(out, out, _MM_SHUFFLE(0, 0, 0, 0));

      /* Pull the first lane of the upper register into lane 0, then rotate
         down by one lane: (s1, s2, s3, s4). */
      s_lo = _mm_move_ss(s_lo, s_hi);
      s_lo = _mm_shuffle_ps(s_lo, s_lo, _MM_SHUFFLE(0, 3, 2, 1));
      s_lo = _mm_sub_ps(s_lo, _mm_mul_ps(out, a_lo));

      /* Clear lane 0 by subtracting it from itself, then rotate it round to
         become the new top lane: (s5, s6, s7, s4 - s4). */
      s_hi = _mm_sub_ss(s_hi, s_hi);
      s_hi = _mm_shuffle_ps(s_hi, s_hi, _MM_SHUFFLE(0, 3, 2, 1));
      s_hi = _mm_sub_ps(s_hi, _mm_mul_ps(out, a_hi));
   }

   _mm_storeu_ps(_mem, s_lo);
   _mm_storeu_ps(_mem + 4, s_hi);
}
00229
/*
 * All-pole filter entry point: y[i] = x[i] + state[0], followed by
 *     state[k] = state[k+1] - y[i]*_den[k+1],  k = 0..ord-1,
 * with state[ord] taken as zero.  _den[0] is never read.
 *
 * x     input samples (N floats)
 * _den  denominator coefficients; elements 1..ord are read
 * y     output samples (N floats)
 * N     number of samples to process
 * ord   filter order
 * _mem  filter state (ord floats), updated in place
 *
 * Orders 10 and 8 are handed to the dedicated SSE kernels.  Every other
 * positive order runs through a scalar loop applying the same recurrence, so
 * that y is always produced.  A non-positive order does nothing.
 */
void iir_mem2(const float *x, const float *_den, float *y, int N, int ord,
              float *_mem)
{
   if (ord == 10)
   {
      iir_mem2_10(x, _den, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      iir_mem2_8(x, _den, y, N, ord, _mem);
   }
   else if (ord > 0)
   {
      int i, k;

      for (i = 0; i < N; i++)
      {
         /* Read the input before writing the output so x and y may alias. */
         const float yi = x[i] + _mem[0];

         y[i] = yi;
         for (k = 0; k + 1 < ord; k++)
            _mem[k] = _mem[k + 1] - yi * _den[k + 1];
         /* Nothing is shifted into the top slot. */
         _mem[ord - 1] = 0.0f - yi * _den[ord];
      }
   }
}
00237
00238
/*
 * Tenth-order all-zero (FIR) filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] + x[i]*_num[k+1],  k = 0..9,
 * where state[10] is taken as zero.  _num[0] is never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..10 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses ten taps
 * _mem  filter state (10 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void fir_mem2_10(const float *x, const float *_num, float *y, int N, int ord,
                 float *_mem)
{
   const __m128 zero = _mm_setzero_ps();
   __m128 num[3], mem[3];
   int i;

   (void)ord;

   /* Ten coefficients / state values are packed as 4 + 4 + 2 lanes. */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem + 4 * i);
      num[i] = _mm_loadu_ps(_num + 4 * i + 1);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[9], _num[10], 0, 0);

   for (i = 0; i < N; i++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 xx = _mm_load_ps1(x + i);

      /* Lane 0 of the sum is the output sample x[i] + state[0]. */
      _mm_store_ss(y + i, _mm_add_ss(xx, mem[0]));

      /* Pull the first lane of the next register into lane 0, then rotate
         down by one lane: (s1, s2, s3, next0). */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      /* Shift state[9] into lane 0 and refill the remaining lanes from an
         explicit zero register.  The padding lanes are computed as x * 0,
         which is NaN when x is infinite or NaN; recycling them would feed
         that NaN into state[9] on every later sample, so one bad input
         would corrupt the output forever instead of for ten samples. */
      mem[2] = _mm_move_ss(zero, _mm_shuffle_ps(mem[2], mem[2], 0x55));
      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }

   /* Write the ten state values back: 4 + 4 + two single-lane stores. */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem + 4, mem[1]);
   _mm_store_ss(_mem + 8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem + 9, mem[2]);
}
00286
/*
 * Eighth-order all-zero (FIR) filter over N samples, SSE version.
 *
 * For each sample the output is y[i] = x[i] + state[0]; the state is then
 * shifted down by one slot and updated as
 *     state[k] = state[k+1] + x[i]*_num[k+1],  k = 0..7,
 * where state[8] is taken as zero.  _num[0] is never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..8 are read
 * y     output samples (N floats); x[i] is loaded before y[i] is stored
 * N     number of samples to process
 * ord   ignored by this kernel, which always uses eight taps
 * _mem  filter state (8 floats), read on entry and written back on exit
 *
 * All loads and stores are unaligned, so no buffer needs special alignment.
 */
void fir_mem2_8(const float *x, const float *_num, float *y, int N, int ord,
                float *_mem)
{
   const __m128 zero = _mm_setzero_ps();
   __m128 num[2], mem[2];
   int i;

   (void)ord;

   /* Eight coefficients / state values fit exactly into two registers. */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem + 4 * i);
      num[i] = _mm_loadu_ps(_num + 4 * i + 1);
   }

   for (i = 0; i < N; i++)
   {
      /* Input sample replicated into all four lanes. */
      const __m128 xx = _mm_load_ps1(x + i);

      /* Lane 0 of the sum is the output sample x[i] + state[0]. */
      _mm_store_ss(y + i, _mm_add_ss(xx, mem[0]));

      /* Pull the first lane of the upper register into lane 0, then rotate
         down by one lane: (s1, s2, s3, s4). */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      /* Put an exact zero in lane 0 before rotating it into the top slot.
         Clearing the lane as "v - v" yields NaN whenever v is infinite or
         NaN, and that NaN would be re-injected on every later sample, so
         one bad input would corrupt the output forever instead of for
         eight samples. */
      mem[1] = _mm_move_ss(mem[1], zero);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }

   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem + 4, mem[1]);
}
00325
00326
/*
 * All-zero (FIR) filter entry point: y[i] = x[i] + state[0], followed by
 *     state[k] = state[k+1] + x[i]*_num[k+1],  k = 0..ord-1,
 * with state[ord] taken as zero.  _num[0] is never read.
 *
 * x     input samples (N floats)
 * _num  numerator coefficients; elements 1..ord are read
 * y     output samples (N floats)
 * N     number of samples to process
 * ord   filter order
 * _mem  filter state (ord floats), updated in place
 *
 * Orders 10 and 8 are handed to the dedicated SSE kernels.  Every other
 * positive order runs through a scalar loop applying the same recurrence, so
 * that y is always produced.  A non-positive order does nothing.
 */
void fir_mem2(const float *x, const float *_num, float *y, int N, int ord,
              float *_mem)
{
   if (ord == 10)
   {
      fir_mem2_10(x, _num, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      fir_mem2_8(x, _num, y, N, ord, _mem);
   }
   else if (ord > 0)
   {
      int i, k;

      for (i = 0; i < N; i++)
      {
         /* Read the input before writing the output so x and y may alias. */
         const float xi = x[i];

         y[i] = xi + _mem[0];
         for (k = 0; k + 1 < ord; k++)
            _mem[k] = _mem[k + 1] + xi * _num[k + 1];
         /* Nothing is shifted into the top slot. */
         _mem[ord - 1] = xi * _num[ord];
      }
   }
}