00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
#include <xmmintrin.h>
00034
00035
/**
 * Inner (dot) product of two float vectors, computed with SSE.
 *
 * Both inputs are read with unaligned loads, so no particular alignment is
 * required of a or b.
 *
 * @param a    First vector, at least len floats.
 * @param b    Second vector, at least len floats.
 * @param len  Number of elements to multiply and accumulate. Values <= 0
 *             yield 0.0f.
 * @return     Sum of a[k]*b[k] for k in [0, len).
 */
static float inner_prod(const float *a, const float *b, int len)
{
   float ret;
   int remaining = len;
   __m128 sum = _mm_setzero_ps();

   /* Main loop: two 4-wide multiply-accumulates (8 floats) per pass.
      Only entered while a full group of 8 is still available, so it never
      reads past the end of either input. */
   while (remaining >= 8)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+0), _mm_loadu_ps(b+0)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+4), _mm_loadu_ps(b+4)));
      a += 8;
      b += 8;
      remaining -= 8;
   }

   /* At most one leftover group of 4 floats. */
   if (remaining >= 4)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
      a += 4;
      b += 4;
      remaining -= 4;
   }

   /* Horizontal reduction: fold the upper pair of lanes onto the lower pair,
      then add lane 1 (broadcast by shuffle mask 0x55) into lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);

   /* Scalar tail for the final len % 4 elements. */
   while (remaining > 0)
   {
      ret += (*a) * (*b);
      a++;
      b++;
      remaining--;
   }

   return ret;
}
00052
00053
static void pitch_xcorr(
const float *_x,
const float *_y,
float *corr,
int len,
int nb_pitch,
char *stack)
00054 {
00055
int i, offset;
00056 __m128 *x, *y;
00057
int N, L;
00058 N = len>>2;
00059 L = nb_pitch>>2;
00060 x =
PUSH(stack, N, __m128);
00061 y =
PUSH(stack, N+L, __m128);
00062
for (i=0;i<N;i++)
00063 x[i] = _mm_loadu_ps(_x+(i<<2));
00064
for (offset=0;offset<4;offset++)
00065 {
00066
for (i=0;i<N+L;i++)
00067 y[i] = _mm_loadu_ps(_y+(i<<2)+offset);
00068
for (i=0;i<L;i++)
00069 {
00070
int j;
00071 __m128 sum, *xx, *yy;
00072 sum = _mm_setzero_ps();
00073 yy = y+i;
00074 xx = x;
00075
for (j=0;j<N;j+=2)
00076 {
00077 sum = _mm_add_ps(sum, _mm_mul_ps(xx[0], yy[0]));
00078 sum = _mm_add_ps(sum, _mm_mul_ps(xx[1], yy[1]));
00079 xx += 2;
00080 yy += 2;
00081 }
00082 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
00083 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
00084 _mm_store_ss(corr+nb_pitch-1-(i<<2)-offset, sum);
00085 }
00086 }
00087 }