summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Guillaume Seguin <guillaume@segu.in> 2011-01-18 16:21:43 +0100
committer Guillaume Seguin <guillaume@segu.in> 2011-01-18 16:21:43 +0100
commit db414d71753e55cd71192ac67842d7a473544ddf (patch)
tree 52481a516bf57b3032810102290db423894819d2
parent facb0287c14800877496a50b3c673b8beb229ce5 (diff)
download spacetimemanifolds-sse.tar.gz
spacetimemanifolds-sse.tar.bz2
SSE optimizations attempt (ref: sse)
-rw-r--r-- compute.cc 35
1 files changed, 27 insertions, 8 deletions
diff --git a/compute.cc b/compute.cc
index 0f1016d..1f3b46c 100644
--- a/compute.cc
+++ b/compute.cc
@@ -2,6 +2,8 @@
#include <cstdio>
#include <limits>
+#include <smmintrin.h>
+
#include "compute.h"
#include "pyramid.h"
#include "tictoc.h"
@@ -41,13 +43,21 @@ double edgecost(Mat & f, int i, Mat & g, int j)
}
*/
+typedef struct fv { /* Four floats that can be accessed lane-by-lane or as one __m128 vector. */
+ union {
+ float f[4]; /* per-lane view; shares storage with v */
+ __m128 v; /* the same 16 bytes viewed as a single SSE vector */
+ };
+} f4v;
+
inline
float blockpow_and_sum(Mat & f, Mat & g,
float p,
int fx0, int fx1, int gx0, int gx1,
int y0, int y1)
{
- register float sum = 0;
+ register __m128 sum __attribute__((aligned (16))) = _mm_setzero_ps(); /* NOTE(review): zeroed here and only read by _mm_store_ss at the end; no visible line ever adds to it */
+ register float sum2 = 0.; /* running total of squared differences between f and g over rows y0..y1-1; all three visible inner loops add into it */
for (register int y = y0; y < y1; ++y)
{
const float* frow = (const float*)(f.data + f.step*y);
@@ -56,21 +66,30 @@ float blockpow_and_sum(Mat & f, Mat & g,
const float* fendm4 = (const float*)(fend - 4);
register float* fx = (float*)(frow + fx0 * 3);
register float* gx = (float*)(grow + gx0 * 3);
- for (; fx < fendm4; fx += 4, gx += 4)
+ register float* f16a = (float*)(fx + 16 * ((long)fx % 16 != 0) - ((long)fx%16)); /* upper bound of the scalar lead-in loop. NOTE(review): (long)fx % 16 is a remainder in bytes, but it is added to a float* (element units), so this is not generally the next 16-byte boundary - confirm the intended math */
+ register float* g16a = (float*)(gx + 16 * ((long)gx % 16 != 0) - ((long)gx%16)); /* same computation for gx. NOTE(review): g16a is not used anywhere in the visible lines */
+ for (; fx < f16a; ++fx, ++gx) /* scalar lead-in: one float per iteration until fx reaches f16a */
{
register float a = *fx - *gx;
- register float b = *(fx + 1) - *(gx + 1);
- register float c = *(fx + 2) - *(gx + 2);
- register float d = *(fx + 3) - *(gx + 3);
- sum += a * a + b * b + c * c + d * d; /* FIXME : fixed L_2 norm */
+ sum2 += a * a; /* FIXME : fixed L_2 norm */
+ }
+ for (; fx < fendm4; fx += 4, gx += 4) /* vector loop: advances both pointers by 4 floats per iteration */
+ {
+ register const __m128 delta = _mm_sub_ps(_mm_loadu_ps(fx),
+ _mm_loadu_ps(gx)); /* unaligned loads of 4 floats from each row, subtracted lane-wise */
+ f4v p; /* NOTE(review): this local shadows the float parameter p of the function */
+ p.v = _mm_mul_ps(delta, delta); /* lane-wise squared differences */
+ sum2 += p.f[0] + p.f[1] + p.f[2]; /* NOTE(review): only lanes 0..2 are added although the loop steps by 4 floats; the removed scalar code summed four terms (a, b, c, d) - confirm that dropping lane 3 is intended */
}
for (; fx < fend; ++fx, ++gx)
{
register float a = *fx - *gx;
- sum += a * a; /* FIXME : fixed L_2 norm */
+ sum2 += a * a; /* FIXME : fixed L_2 norm */
}
}
- return sum;
+ float r; /* receives the lowest lane of the vector accumulator */
+ _mm_store_ss(&r, sum); /* stores only the lowest of the four lanes of sum into r */
+ return r + sum2; /* NOTE(review): sum is never accumulated into in the visible code, so r is presumably 0 and the result comes from sum2 */
}
double edgecost_parallel(Mat & f, int i, Mat & g, int j)