Speedup a short to float cast?

Speedup a short to float cast? - c++

I have a short to float cast in C++ that is bottlenecking my code.
The code translates from a hardware device buffer which is natively shorts, this represents the input from a fancy photon counter.
float factor= 1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
A few details
Value should go from 0 to 2^16-1, it represents the pixel values of a highly sensitive camera
I'm on a multicore x86 machine with an i7 processor (i7 960 which is SSE 4.2 and 4.1).
Source is aligned to an 8 bit boundary (a requirement of the hardware device)
W*H is always divisible by 8, most of the time W and H are divisible by 8
This makes me sad, is there anything I can do about it?
I am using Visual Studios 2012...

Here's a basic SSE4.1 implementation:
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
// Load 8 16-bit ushorts.
// vi = {a,b,c,d,e,f,g,h}
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
// vi0 = {a,0,b,0,c,0,d,0}
// vi1 = {e,0,f,0,g,0,h,0}
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
This assumes:
source and destination are both aligned to 16 bytes.
W*H is a multiple of 8.
It's possible to do better by further unrolling this loop. (see below)
The idea here is as follows:
Load 8 shorts into a single SSE register.
Split the register into two: One with the bottom 4 shorts and the other with the top 4 shorts.
Zero-extend both registers into 32-bit integers.
Convert them both to floats.
Multiply by the factor.
Store them into destination.
EDIT :
It's been a while since I've done this type of optimization, so I went ahead and unrolled the loops.
Core i7 920 # 3.5 GHz
Visual Studio 2012 - Release x64:
Original Loop : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416
Further unrolling resulted in diminishing returns.
Here's the test code:
#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;
void default_loop(float *destination,const short* source,float value,int size){
float factor = 1.0f / value;
for (int i = 0; i < size; i++)
{
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void print_sum(const float *destination,int size){
float sum = 0;
for (int i = 0; i < size; i++){
sum += destination[i];
}
cout << sum << endl;
}
int main(){
int size = 8000;
short *source = (short*)_mm_malloc(size * sizeof(short), 16);
float *destination = (float*)_mm_malloc(size * sizeof(float), 16);
for (int i = 0; i < size; i++){
source[i] = i;
}
float value = 1.1;
int iterations = 1000000;
clock_t start;
// Default Loop
start = clock();
for (int it = 0; it < iterations; it++){
default_loop(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
// Vectorize 8, no unroll
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll1(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
// Vectorize 8, unroll 2
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll2(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
_mm_free(source);
_mm_free(destination);
system("pause");
}

I believe I have the best answer. My results are much faster than Mystical's. They only require SSE2 but take advantage of SSE3, SSE4, AVX, and even AVX2 if available. You don't have to change any code. You only have to recompile.
I ran over three sizes: 8008, 64000, and 2560*1920 = 4915200. I tried several different variations. I list the most important ones below. The function vectorize8_unroll2 is mystical's function. I made a improved version of his called vectorize8_unroll2_parallel. The function vec16_loop_unroll2_fix and vec16_loop_unroll2_parallel_fix are my functions which I believe are better than mystical's. These functions will automatically use AVX if you compile with AVX but work fine on SSE4 and even SSE2
Additionally, you wrote "W*H is always divisible by 8, most of the time W and H are divisible by 8".
So we can't assume W*H is divisible by 16 in all cases. Mystical's function vectorize8_unroll2 has a bug when size is not a multiple of 16 (try size=8008 in his code and you will see what I mean). My code has no such bug.
I'm using Ander Fog's vectorclass for the vectorization. It's not a lib or dll file. It's just a few header files. I use OpenMP for the parallelization. Here are some of the results:
Intel Xeon E5630 #2.53GHz (supports upto SSE4.2)
size 8008, size2 8032, iterations 1000000
default_loop time: 7.935 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000
size 64000, size2 64000, iterations 100000
default_loop time: 6.387 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 5.125 seconds, diff 0.000000
vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000
Edit: I added the results on a system with AVX using GCC at the end of this answer.
Below is the code. The code only looks long because I do lots of cross checks and test many variations. Download the vectorclass at
http://www.agner.org/optimize/#vectorclass . Copy the header files (vectorclass.h, instrset.h, vectorf128.h, vectorf256.h, vectorf256e.h, vectori128.h, vectori256.h, vectori256e.h) into the directory you compile from. Add /D__SSE4_2__ under C++/CommandLine. Compile in release mode. If you have a CPU with AVX then put /arch:AVX instead. Add OpenMP support under C++ properites/languages.
In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -mSSE4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp
In the code below the function vec16_loop_unroll2_parallel requires the array be a multiple of 32. You can change the array size to be a multiple of 32 (that's what size2 refers to) or if that's not possible you can just use the function vec16_loop_unroll2_parallel_fix which has no such restriction. It's just as fast anyway.
#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
inline void* aligned_malloc(size_t size, size_t align) {
void *result;
#ifdef _MSC_VER
result = _aligned_malloc(size, align);
#else
if(posix_memalign(&result, align, size)) result = 0;
#endif
return result;
}
inline void aligned_free(void *ptr) {
#ifdef _MSC_VER
_aligned_free(ptr);
#else
free(ptr);
#endif
}
void default_loop(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f/value;
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f / value;
#pragma omp parallel for
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
for (; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for (; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
#pragma omp parallel for
for (int i=0; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for(int i = ROUND_DOWN(size, 32); i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void copy_arrays(float* a, float*b, const int size) {
float sum = 0;
for(int i=0; i<size; i++) {
b[i] = a[i];
}
}
float compare_arrays(float* a, float*b, const int size) {
float sum = 0;
for(int i=0; i<size; i++) {
float diff = a[i] - b[i];
if(diff!=0) {
printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
break;
}
sum += diff;
}
return sum;
}
void randomize_array(unsigned short* a, const int size) {
for(int i=0; i<size; i++) {
float r = (float)rand()/RAND_MAX;
a[i] = (int)(65536*r);
}
}
void run(int size, int iterations) {
int rd = ROUND_DOWN(size, 32);
int size2 = rd == size ? size : rd + 32;
float value = 1.1f;
printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);
void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
fp[0] = default_loop;
fp[1] = vec8_loop;
fp[2] = vec8_loop_unroll2;
fp[3] = vec16_loop;
fp[4] = vec16_loop_unroll2;
fp[5] = vec16_loop_unroll2_fix;
fp[6] = vectorize8_unroll1;
fp[7] = vectorize8_unroll2;
fp[8] = default_loop_parallel;
fp[9] = vec8_loop_parallel;
fp[10] = vec8_loop_unroll2_parallel;
fp[11] = vec16_loop_parallel;
fp[12] = vec16_loop_unroll2_parallel;
fp[13] = vec16_loop_unroll2_parallel_fix;
fp[14] = vectorize8_unroll1_parallel;
fp[15] = vectorize8_unroll2_parallel;
char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
"default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
"vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};
randomize_array(source, size2);
copy_arrays(destination_old, destination_ref, size);
fp[0](destination_ref, source, value, size);
for(int i=0; i<16; i++) {
copy_arrays(destination_old, destination, size);
double dtime = omp_get_wtime();
for (int it = 0; it < iterations; it++){
fp[i](destination, source, value, size);
}
dtime = omp_get_wtime() - dtime;
float diff = compare_arrays(destination, destination_ref, size);
printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
}
printf("\n");
aligned_free(source);
aligned_free(destination);
aligned_free(destination_old);
aligned_free(destination_ref);
}
int main() {
run(8008, 1000000);
run(64000, 100000);
run(2560*1920, 1000);
}
Results Using GCC on a system with AVX. GCC automatically parallelizes the loop (Visual Studio fails due to the short but works if you try int). You gain very little with hand written vectorization code. However, using multiple threads can help depending upon the array size. For the small array size 8008 OpenMP gives a worse result. However, for the larger array size 128000 using OpenMP gives much better resutls. For the largest array size 4915200 it's entirely memory bound and OpenMP does not help.
i7-2600k # 4.4GHz
size 8008, size2 8032, iterations 1000000
default_loop time: 1.319 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
vectorize8_unroll2 time: 1.227 seconds, diff 0.000000
vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000
size 128000, size2 128000, iterations 100000
default_loop time: 2.902 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
vectorize8_unroll2 time: 2.844 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 2.313 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000
vectorize8_unroll2 time: 2.318 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000

Using SSE intrinsics, on my machine [Quad Core Athlon, 3.3GHz, 16GB of RAM], and g++ -O2 optimisation [1] gives about 2.5-3x speed up. I also wrote a function to do the same thing in inline assembler, but it's not noticeably faster (again, this applies on my machine, feel free to run on other machines).
I tried a variety of sizes of H * W, and it all gives approximately the same results.
[1] Using g++ -O3 gives the same time for all four functions, as apparently -O3 enables "automatically vectorise code". So the whole thing was a bit of a waste of time assuming your compiler supports similar auto-vectorisation functionality.
Results
convert_naive sum=4373.98 t=7034751 t/n=7.03475
convert_naive sum=4373.98 t=7266738 t/n=7.26674
convert_naive sum=4373.98 t=7006154 t/n=7.00615
convert_naive sum=4373.98 t=6815329 t/n=6.81533
convert_naive sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4 sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4 sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4 sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4 sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4 sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm sum=4373.98 t=3176522 t/n=3.17652
Code
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>
#define W 1000
#define H 1000
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
void convert_naive(short *source, float *destination)
{
float factor= 1.0f/32767;
for (int i = 0; i < W*H; i++)
{
int value = source[i];
destination[i] = value*factor;
}
}
void convert_unroll4(short *source, float *destination)
{
float factor= 1.0f/32767;
for (int i = 0; i < W*H; i+=4)
{
int v1 = source[i];
int v2 = source[i+1];
int v3 = source[i+2];
int v4 = source[i+3];
destination[i] = v1*factor;
destination[i+1] = v2*factor;
destination[i+2] = v3*factor;
destination[i+3] = v4*factor;
}
}
void convert_sse_intrinsic(short *source, float *destination)
{
__m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
__m64 zero1 = { 0,0 };
__m128i zero2 = { 0,0 };
__m64 *ps = reinterpret_cast<__m64 *>(source);
__m128 *pd = reinterpret_cast<__m128 *>(destination);
for (int i = 0; i < W*H; i+=4)
{
__m128i value = _mm_unpacklo_epi16(_mm_set_epi64(zero1, *ps), zero2);
value = _mm_srai_epi32(_mm_slli_epi32(value, 16), 16);
__m128 fval = _mm_cvtepi32_ps(value);
*pd = _mm_mul_ps(fval, factor); // destination[0,1,2,3] = value[0,1,2,3] * factor;
pd++;
ps++;
}
}
void convert_sse_inlineasm(short *source, float *destination)
{
__m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
__asm__ __volatile__(
"\t pxor %%xmm1, %%xmm1\n"
"\t movaps %3, %%xmm2\n"
"\t mov $0, %%rax\n"
"1:"
"\t movq (%1, %%rax), %%xmm0\n"
"\t movq 8(%1, %%rax), %%xmm3\n"
"\t movq 16(%1, %%rax), %%xmm4\n"
"\t movq 24(%1, %%rax), %%xmm5\n"
"\t punpcklwd %%xmm1, %%xmm0\n"
"\t pslld $16, %%xmm0\n"
"\t psrad $16, %%xmm0\n"
"\t cvtdq2ps %%xmm0, %%xmm0\n"
"\t mulps %%xmm2, %%xmm0\n"
"\t punpcklwd %%xmm1, %%xmm3\n"
"\t pslld $16, %%xmm3\n"
"\t psrad $16, %%xmm3\n"
"\t cvtdq2ps %%xmm3, %%xmm3\n"
"\t mulps %%xmm2, %%xmm3\n"
"\t punpcklwd %%xmm1, %%xmm4\n"
"\t pslld $16, %%xmm4\n"
"\t psrad $16, %%xmm4\n"
"\t cvtdq2ps %%xmm4, %%xmm4\n"
"\t mulps %%xmm2, %%xmm4\n"
"\t punpcklwd %%xmm1, %%xmm5\n"
"\t pslld $16, %%xmm5\n"
"\t psrad $16, %%xmm5\n"
"\t cvtdq2ps %%xmm5, %%xmm5\n"
"\t mulps %%xmm2, %%xmm5\n"
"\t movaps %%xmm0, (%0, %%rax, 2)\n"
"\t movaps %%xmm3, 16(%0, %%rax, 2)\n"
"\t movaps %%xmm4, 32(%0, %%rax, 2)\n"
"\t movaps %%xmm5, 48(%0, %%rax, 2)\n"
"\t addq $32, %%rax\n"
"\t cmpq %2, %%rax\n"
"\t jbe 1b\n"
: /* no outputs */
: "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
"rax", "xmm0", "xmm1", "xmm3");
}
short inbuffer[W * H] __attribute__ ((aligned (16)));
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
float outbuffer2[W * H];
#endif
typedef void (*func)(short *source, float *destination);
struct BmEntry
{
const char *name;
func fn;
};
void bm(BmEntry& e)
{
memset(outbuffer, 0, sizeof(outbuffer));
unsigned long long t = rdtsc();
e.fn(inbuffer, outbuffer);
t = rdtsc() - t;
float sum = 0;
for(int i = 0; i < W * H; i++)
{
sum += outbuffer[i];
}
#if DEBUG
convert_naive(inbuffer, outbuffer2);
for(int i = 0; i < W * H; i++)
{
if (outbuffer[i] != outbuffer2[i])
{
std::cout << i << ":: " << inbuffer[i] << ": "
<< outbuffer[i] << " != " << outbuffer2[i]
<< std::endl;
}
}
#endif
std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t <<
" t/n=" << (double)t / (W * H) << std::endl;
}
#define BM(x) { #x, x }
BmEntry table[] =
{
BM(convert_naive),
BM(convert_unroll4),
BM(convert_sse_intrinsic),
BM(convert_sse_inlineasm),
};
int main()
{
for(int i = 0; i < W * H; i++)
{
inbuffer[i] = (short)i;
}
for(int i = 0; i < sizeof(table)/sizeof(table[i]); i++)
{
for(int j = 0; j < 5; j++)
bm(table[i]);
}
return 0;
}

No sure if the condition expression in the loop is evaluated only once.
You can try:
float factor= 1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
int value = source[i];//short -> int
destination[i] = value*factor;//int->float
}

This is not a valid answer, don't take it as it, but I'm actually wondering how would the code behave by using a 256k look-up table. (basically a 'short to float' table with 65536 entries).
A CoreI7 has about 8 megabytes of cache I believe, so the look-up table would fit in the data cache.
I really wonder how that would impact the performance :)

and You can use OpenMP to hire every core of your CPU, and it is simple just do as following:
#include <omp.h>
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
here is the result based on previous program, just add the like this:
#pragma omp parallel for
for (int it = 0; it < iterations; it++){
...
}
and then here is the result
beta#beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp
beta#beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007
beta#beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3
beta#beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007
. .
result shows 100% improvment with openmp. Visual C++ supports openmp too.

You could try to approximate the expression
float factor = 1.0f/value;
by an fraction numerator/denomitator where both numerator and denominator are ints. This can be done to the precision you need in your application like
int denominator = 10000;
int numerator = factor * denominator;
Then you can do your computation in integer arithmetics like
int value = source[i];
destination[i] = (value * numerator) / numerator;
You have to take care of overflows, perhaps you need to switch to long (or even long long on 64bit systems) for the calculation.

Related

SSE mean filter in c++ and OpenCV

I would like to modify the code for an OpenCV mean filter to use Intel intrinsics. I'm an SSE newbie and I really don't know where to start from. I checked a lot of resources on the web, but I didn't have a lot of success.
This is the program:
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace cv;
using namespace std;
int main()
{
int A[3][3] = { { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } };
int c = 0;
int d = 0;
Mat var1 = imread("images.jpg", 1);
Mat var2(var1.rows, var1.cols, CV_8UC3, Scalar(0, 0, 0));
for (int i = 0; i < var1.rows; i++)
{
var2.at<Vec3b>(i, 0) = var1.at<Vec3b>(i, 0);
var2.at<Vec3b>(i, var1.cols - 1) = var1.at<Vec3b>(i, var1.cols - 1);
}
for (int i = 0; i < var1.cols; i++)
{
var2.at<Vec3b>(0, i) = var1.at<Vec3b>(0, i);
var2.at<Vec3b>(var1.rows - 1, i) = var1.at<Vec3b>(var1.rows - 1, i);
}
for (int i = 0; i < var1.rows; i++) {
for (int j = 0; j < var1.cols; j++)
{
c = 0;
for (int m = i; m < var1.rows; m++, c++)
{
if (c < 3)
{
d = 0;
for (int n = j; n < var1.cols; n++, d++)
{
if (d < 3)
{
if ((i + 1) < var1.rows && (j + 1) < var1.cols)
{
var2.at<Vec3b>(i + 1, j + 1)[0] += var1.at<Vec3b>(m, n)[0] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[1] += var1.at<Vec3b>(m, n)[1] * A[m - i][n - j] / 9;
var2.at<Vec3b>(i + 1, j + 1)[2] += var1.at<Vec3b>(m, n)[2] * A[m - i][n - j] / 9;
}
}
}
}
}
}
}
imshow("window1", var1);
imshow("window2", var2);
waitKey(0);
return(0);
}
The part that I find difficult is understanding how to convert the innermost 2 loops, where the mean value is computed. Any help will be greatly appreciated.

Just for fun, I thought it might be interesting to start with a naive implementation of a 3x3 mean filter and then optimise this incrementally, ending up with a SIMD (SSE) implementation, measuring the throughput improvement at each stage.
1 - Mean_3_3_ref - reference implementation
This is just a simple scalar implementation which we'll use as a baseline for throughput and for validating further implementations:
void Mean_3_3_ref(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
for (int x = 1; x < image_in.cols - 1; ++x)
{
for (int c = 0; c < 3; ++c)
{
image_out.at<Vec3b>(y, x)[c] = (image_in.at<Vec3b>(y - 1, x - 1)[c] +
image_in.at<Vec3b>(y - 1, x )[c] +
image_in.at<Vec3b>(y - 1, x + 1)[c] +
image_in.at<Vec3b>(y , x - 1)[c] +
image_in.at<Vec3b>(y , x )[c] +
image_in.at<Vec3b>(y , x + 1)[c] +
image_in.at<Vec3b>(y + 1, x - 1)[c] +
image_in.at<Vec3b>(y + 1, x )[c] +
image_in.at<Vec3b>(y + 1, x + 1)[c] + 4) / 9;
}
}
}
}
2 - Mean_3_3_scalar - somewhat optimised scalar implementation
Exploit the redundancy in summing successive columns - we save the last two column sums so that we only need to calculate one new column sum (per channel) on each iteration:
void Mean_3_3_scalar(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
int r_1, g_1, b_1;
int r0, g0, b0;
int r1, g1, b1;
r_1 = g_1 = b_1 = 0;
r0 = g0 = b0 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r_1 += image_in.at<Vec3b>(yy, 0)[0];
g_1 += image_in.at<Vec3b>(yy, 0)[1];
b_1 += image_in.at<Vec3b>(yy, 0)[2];
r0 += image_in.at<Vec3b>(yy, 1)[0];
g0 += image_in.at<Vec3b>(yy, 1)[1];
b0 += image_in.at<Vec3b>(yy, 1)[2];
}
for (int x = 1; x < image_in.cols - 1; ++x)
{
r1 = g1 = b1 = 0;
for (int yy = y - 1; yy <= y + 1; ++yy)
{
r1 += image_in.at<Vec3b>(yy, x + 1)[0];
g1 += image_in.at<Vec3b>(yy, x + 1)[1];
b1 += image_in.at<Vec3b>(yy, x + 1)[2];
}
image_out.at<Vec3b>(y, x)[0] = (r_1 + r0 + r1 + 4) / 9;
image_out.at<Vec3b>(y, x)[1] = (g_1 + g0 + g1 + 4) / 9;
image_out.at<Vec3b>(y, x)[2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
3 - Mean_3_3_scalar_opt - further optimised scalar implementation
As per Mean_3_3_scalar, but also remove OpenCV overheads by caching pointers to each row that we are working on:
void Mean_3_3_scalar_opt(const Mat &image_in, Mat &image_out)
{
for (int y = 1; y < image_in.rows - 1; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
int r_1 = input_1[0] + input0[0] + input1[0];
int g_1 = input_1[1] + input0[1] + input1[1];
int b_1 = input_1[2] + input0[2] + input1[2];
int r0 = input_1[3] + input0[3] + input1[3];
int g0 = input_1[4] + input0[4] + input1[4];
int b0 = input_1[5] + input0[5] + input1[5];
for (int x = 1; x < image_in.cols - 1; ++x)
{
int r1 = input_1[x * 3 + 3] + input0[x * 3 + 3] + input1[x * 3 + 3];
int g1 = input_1[x * 3 + 4] + input0[x * 3 + 4] + input1[x * 3 + 4];
int b1 = input_1[x * 3 + 5] + input0[x * 3 + 5] + input1[x * 3 + 5];
output[x * 3 ] = (r_1 + r0 + r1 + 4) / 9;
output[x * 3 + 1] = (g_1 + g0 + g1 + 4) / 9;
output[x * 3 + 2] = (b_1 + b0 + b1 + 4) / 9;
r_1 = r0;
g_1 = g0;
b_1 = b0;
r0 = r1;
g0 = g1;
b0 = b1;
}
}
}
4 - Mean_3_3_blur - leverage OpenCV's blur function
OpenCV has a function called blur, which is based on the function boxFilter, which is just another name for a mean filter. Since OpenCV code has been quite heavily optimised over the years (using SIMD in many cases), let's see if this makes a big improvement over our scalar code:
void Mean_3_3_blur(const Mat &image_in, Mat &image_out)
{
blur(image_in, image_out, Size(3, 3));
}
5 - Mean_3_3_SSE - SSE implementation
This a reasonably efficient SIMD implementation. It uses the same techniques as the scalar code above in order to eliminate redundancy in processing successive pixels:
#include <tmmintrin.h> // Note: requires SSSE3 (aka MNI)
inline void Load2(const ssize_t offset, const uint8_t* const src, __m128i& vh, __m128i& vl)
{
const __m128i v = _mm_loadu_si128((__m128i *)(src + offset));
vh = _mm_unpacklo_epi8(v, _mm_setzero_si128());
vl = _mm_unpackhi_epi8(v, _mm_setzero_si128());
}
inline void Store2(const ssize_t offset, uint8_t* const dest, const __m128i vh, const __m128i vl)
{
__m128i v = _mm_packus_epi16(vh, vl);
_mm_storeu_si128((__m128i *)(dest + offset), v);
}
template <int SHIFT> __m128i ShiftL(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, SHIFT * sizeof(short)); }
template <int SHIFT> __m128i ShiftR(const __m128i v0, const __m128i v1) { return _mm_alignr_epi8(v1, v0, 16 - SHIFT * sizeof(short)); }
template <int CHANNELS> void Mean_3_3_SSE_Impl(const Mat &image_in, Mat &image_out)
{
const int nx = image_in.cols;
const int ny = image_in.rows;
const int kx = 3 / 2; // x, y borders
const int ky = 3 / 2;
const int kScale = 3 * 3; // scale factor = total number of pixels in sum
const __m128i vkScale = _mm_set1_epi16((32768 + kScale / 2) / kScale);
const int nx0 = ((nx + kx) * CHANNELS + 15) & ~15; // round up total width to multiple of 16
int x, y;
for (y = ky; y < ny - ky; ++y)
{
const uint8_t * const input_1 = image_in.ptr(y - 1);
const uint8_t * const input0 = image_in.ptr(y);
const uint8_t * const input1 = image_in.ptr(y + 1);
uint8_t * const output = image_out.ptr(y);
__m128i vsuml_1, vsumh0, vsuml0;
__m128i vh, vl;
vsuml_1 = _mm_set1_epi16(0);
Load2(0, input_1, vsumh0, vsuml0);
Load2(0, input0, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
Load2(0, input1, vh, vl);
vsumh0 = _mm_add_epi16(vsumh0, vh);
vsuml0 = _mm_add_epi16(vsuml0, vl);
for (x = 0; x < nx0; x += 16)
{
__m128i vsumh1, vsuml1, vsumh, vsuml;
Load2((x + 16), input_1, vsumh1, vsuml1);
Load2((x + 16), input0, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
Load2((x + 16), input1, vh, vl);
vsumh1 = _mm_add_epi16(vsumh1, vh);
vsuml1 = _mm_add_epi16(vsuml1, vl);
vsumh = _mm_add_epi16(vsumh0, ShiftR<CHANNELS>(vsuml_1, vsumh0));
vsuml = _mm_add_epi16(vsuml0, ShiftR<CHANNELS>(vsumh0, vsuml0));
vsumh = _mm_add_epi16(vsumh, ShiftL<CHANNELS>(vsumh0, vsuml0));
vsuml = _mm_add_epi16(vsuml, ShiftL<CHANNELS>(vsuml0, vsumh1));
// round mean
vsumh = _mm_mulhrs_epi16(vsumh, vkScale);
vsuml = _mm_mulhrs_epi16(vsuml, vkScale);
Store2(x, output, vsumh, vsuml);
vsuml_1 = vsuml0;
vsumh0 = vsumh1;
vsuml0 = vsuml1;
}
}
}
void Mean_3_3_SSE(const Mat &image_in, Mat &image_out)
{
const int channels = image_in.channels();
switch (channels)
{
case 1:
Mean_3_3_SSE_Impl<1>(image_in, image_out);
break;
case 3:
Mean_3_3_SSE_Impl<3>(image_in, image_out);
break;
default:
throw("Unsupported format.");
break;
}
}
Results
I benchmarked all of the above implementations on an 8th gen Core i9 (MacBook Pro 16,1) at 2.4 GHz, with an image size of 2337 rows x 3180 cols. The compiler was Apple clang version 12.0.5 (clang-1205.0.22.9) and the only optimisation switch was -O3. OpenCV version was 4.5.0 (via Homebrew). (Note: I verified that for Mean_3_3_blur the cv::blur function was dispatched to an AVX2 implementation.) The results:
Mean_3_3_ref 62153 µs
Mean_3_3_scalar 41144 µs = 1.51062x
Mean_3_3_scalar_opt 26238 µs = 2.36882x
Mean_3_3_blur 20121 µs = 3.08896x
Mean_3_3_SSE 4838 µs = 12.84680x
Notes
I have ignored the border pixels in all implementations - if required these can either be filled with pixels from the original image or using some other form of edge pixel processing.
The code is not "industrial strength" - it was only written for benchmarking purposes.
There are a few further possible optimisations, e.g. use wider SIMD (AVX2, AVX512), exploit the redundancy between successive rows, etc - these are left as an exercise for the reader.
The SSE implementation is fastest, but this comes at the cost of increased complexity, decreased mantainability and reduced portability.
The OpenCV blur function gives the second best performance, and should probably be the preferred solution if it meets throughput requirements - it's the simplest solution, and simple is good.

What's wrong in this SSE2 transposition?

I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}

(update: this answer was written before the edits that show v_phase being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase at each step. Yeah, there was a // some other code (that will use phase) comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase) to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod on the result to range-reduce into the [0..2Pi) range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1] before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.

SSE addition and conversion

Here's the thing, how can I add two unsigned char arrays and store the result in an unsigned short array by using SSE. Can anyone give me some help or hint. This is what I have done so far. I just don't know where the error is..need some help
#include<iostream>
#include<intrin.h>
#include<windows.h>
#include<emmintrin.h>
#include<iterator>
using namespace std;
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
{
unsigned char *op3 = new unsigned char[N];
unsigned char *op4 = new unsigned char[N];
__m128i *sse_op3 = (__m128i*)op3;
__m128i *sse_op4 = (__m128i*)op4;
__m128i *sse_result = (__m128i*)output;
for (int i = 0; i < N; i = i + 16)
{
__m128i src = _mm_loadu_si128((__m128i*)input1);
__m128i zero = _mm_setzero_si128();
__m128i higher = _mm_unpackhi_epi8(src, zero);
__m128i lower = _mm_unpacklo_epi8(src, zero);
_mm_storeu_si128(sse_op3, lower);
sse_op3 = sse_op3 + 1;
_mm_storeu_si128(sse_op3, higher);
sse_op3 = sse_op3 + 1;
input1 = input1 + 16;
}
for (int j = 0; j < N; j = j + 16)
{
__m128i src1 = _mm_loadu_si128((__m128i*)input2);
__m128i zero1 = _mm_setzero_si128();
__m128i higher1 = _mm_unpackhi_epi8(src1, zero1);
__m128i lower1 = _mm_unpacklo_epi8(src1, zero1);
_mm_storeu_si128(sse_op4, lower1);
sse_op4 = sse_op4 + 1;
_mm_storeu_si128(sse_op4, higher1);
sse_op4 = sse_op4 + 1;
input2 = input2 + 16;
}
__m128i *sse_op3_new = (__m128i*)op3;
__m128i *sse_op4_new = (__m128i*)op4;
for (int y = 0; y < N; y = y + 8)
{
*sse_result = _mm_adds_epi16(*sse_op3_new, *sse_op4_new);
sse_result = sse_result + 1;
sse_op3_new = sse_op3_new + 1;
sse_op4_new = sse_op4_new + 1;
}
}
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
{
for (int i = 0; i < N; i++)
output[i] = (unsigned short)input1[i] + (unsigned short)input2[i];
}
int main()
{
int n = 1023;
unsigned char *p0 = new unsigned char[n];
unsigned char *p1 = new unsigned char[n];
unsigned short *p21 = new unsigned short[n];
unsigned short *p22 = new unsigned short[n];
for (int j = 0; j < n; j++)
{
p21[j] = rand() % 256;
p22[j] = rand() % 256;
}
C_add(p0, p1, p22, n);
cout << "C_add finished!" << endl;
sse_add(p0, p1, p21, n);
cout << "sse_add finished!" << endl;
for (int j = 0; j < n; j++)
{
if (p21[j] != p22[j])
{
cout << "diff!!!!!#######" << endl;
}
}
//system("pause");
delete[] p0;
delete[] p1;
delete[] p21;
delete[] p22;
return 0;
}

Assuming everything is aligned to _Alignof(__m128i) and the size of the array is a multiple of sizeof(__m128i), something like this should work:
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) {
__m128i* r = (__m128i*) res;
__m128i* ap = (__m128i*) a;
__m128i* bp = (__m128i*) b;
for (size_t i = 0 ; i < (size / sizeof(__m128i)) ; i++) {
r[(i * 2)] = _mm_add_epi16(_mm_cvtepu8_epi16(ap[i]), _mm_cvtepu8_epi16(bp[i]));
r[(i * 2) + 1] = _mm_add_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(ap[i], 8)), _mm_cvtepu8_epi16(_mm_srli_si128(bp[i], 8)));
}
}
FWIW, NEON would be a bit simpler (using vaddl_u8 and vaddl_high_u8).
If you're dealing with unaligned data you can use _mm_loadu_si128/_mm_storeu_si128. If size isn't a multiple of 16 you'll just have to do the remainder without SSE.
Note that this may be something your compiler can do automatically (I haven't checked). You may want to try something like this:
#pragma omp simd
for (size_t i = 0 ; i < size ; i++) {
res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]);
}
That uses OpenMP 4, but there is also Cilk++ (#pragma simd), clang (#pragma clang loop vectorize(enable)), gcc (#pragma GCC ivdep), or you could just hope the compiler is smart enough without the pragma hint.

Fast implementation of covariance of two 8-bit arrays

I need to compare a big amount of similar images of small size (up to 200x200).
So I try to implement SSIM (Structural similarity see https://en.wikipedia.org/wiki/Structural_similarity ) algorithm.
SSIM requires calculation of covariance of two 8-bit gray images.
A trivial implementation look like:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
for(size_t i = 0; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
But it has poor performance.
So I hope to improve it with using SIMD or CUDA (I heard that it can be done).
Unfortunately I have no experience to do this.
How it will look? And where I have to go?

I have another nice solution!
At first I want to mention some mathematical formulas:
averageX = Sum(x[i])/size;
averageY = Sum(y[i])/size;
And therefore:
Sum((x[i] - averageX)*(y[i] - averageY))/size =
Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size -
Sum(averageX*y[i])/size + Sum(averageX*averageY)/size =
Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size -
averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size =
Sum(x[i]*y[i])/size - averageY*averageX -
averageX*averageY + averageX*averageY =
Sum(x[i]*y[i])/size - averageY*averageX;
It allows to modify our algorithm:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0; // If images will have size greater then 256x256 than you have to use uint64_t.
for(size_t i = 0; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}
And only after that we can use SIMD (I used SSE2):
#include <emmintrin.h>
inline __m128i SigmaXY(__m128i x, __m128i y)
{
__m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
__m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(y, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
return _mm_add_epi32(lo, hi);
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128i sums = _mm_setzero_si128();
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
}
uint32_t _sums[4];
_mm_storeu_si128(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}

There is a SIMD implementation of the algorithm (I used SSE4.1):
#include <smmintrin.h>
template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, __m128 & averageX, __m128 & averageY)
{
__m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
__m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY))
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128 sums = _mm_setzero_ps();
__m128 avgX = _mm_set1_ps(averageX);
__m128 avgY = _mm_set1_ps(averageY);
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY);
}
float _sums[4];
_mm_storeu_ps(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
I hope that it will useful for you.

Find largest element in matrix and its column and row indexes using SSE and AVX

I need to find the largest element in 1d matrix and its column and row indexes.
I use 1d matrix, so just finding the max element's index is needed first and then it is easy to get row and column.
My problem is that I cannot get that index.
I have a working function that finds largest element and uses SSE, here it is:
float find_largest_element_in_matrix_SSE(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m128 max_el = _mm_loadu_ps(m);
__m128 curr;
for (i = 4; i < dims * dims; i += 4)
{
curr = _mm_loadu_ps(m + i);
max_el = _mm_max_ps(max_el, curr);
}
__declspec(align(16))float max_v[4] = { 0 };
_mm_store_ps(max_v, max_el);
return max(max(max(max_v[0], max_v[1]), max_v[2]), max_v[3]);
}
and also I have a non-working function that uses AVX:
float find_largest_element_in_matrix_AVX(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m256 max_el = _mm256_loadu_ps(m);
__m256 curr;
for (i = 8; i < dims * dims; i += 8)
{
curr = _mm256_loadu_ps(m + i);
max_el = _mm256_max_ps(max_el, curr);
}
__declspec(align(32))float max_v[8] = { 0 };
_mm256_store_ps(max_v, max_el);
__m256 y = _mm256_permute2f128_ps(max_el, max_el, 1);
__m256 m1 = _mm256_max_ps(max_el, y);m1[1] = max(max_el[1], max_el[3])
__m256 m2 = _mm256_permute_ps(m1, 5);
__m256 m_res = _mm256_max_ps(m1, m2);
return m[0];
}
Could anyone help me with actually finding the index of the max element and make my AVX version work?

Here's a working SSE (SSE 4) implementation that returns the max val and corresponding index, along with a scalar reference implementation and test harness:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h> // SSE 4.1
float find_largest_element_in_matrix_ref(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
int i;
*maxIndex = 0;
for (i = 1; i < dims * dims; ++i)
{
if (m[i] > maxVal)
{
maxVal = m[i];
*maxIndex = i;
}
}
return maxVal;
}
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
float aMaxVal[4];
int32_t aMaxIndex[4];
int i;
*maxIndex = 0;
const __m128i vIndexInc = _mm_set1_epi32(4);
__m128i vMaxIndex = _mm_setr_epi32(0, 1, 2, 3);
__m128i vIndex = vMaxIndex;
__m128 vMaxVal = _mm_loadu_ps(m);
for (i = 4; i < dims * dims; i += 4)
{
__m128 v = _mm_loadu_ps(&m[i]);
__m128 vcmp = _mm_cmpgt_ps(v, vMaxVal);
vIndex = _mm_add_epi32(vIndex, vIndexInc);
vMaxVal = _mm_max_ps(vMaxVal, v);
vMaxIndex = _mm_blendv_epi8(vMaxIndex, vIndex, _mm_castps_si128(vcmp));
}
_mm_storeu_ps(aMaxVal, vMaxVal);
_mm_storeu_si128((__m128i *)aMaxIndex, vMaxIndex);
maxVal = aMaxVal[0];
*maxIndex = aMaxIndex[0];
for (i = 1; i < 4; ++i)
{
if (aMaxVal[i] > maxVal)
{
maxVal = aMaxVal[i];
*maxIndex = aMaxIndex[i];
}
}
return maxVal;
}
int main()
{
const int dims = 1024;
float m[dims * dims];
float maxVal_ref, maxVal_SSE;
int maxIndex_ref = -1, maxIndex_SSE = -1;
int i;
srand(time(NULL));
for (i = 0; i < dims * dims; ++i)
{
m[i] = (float)rand() / RAND_MAX;
}
maxVal_ref = find_largest_element_in_matrix_ref(m, dims, &maxIndex_ref);
maxVal_SSE = find_largest_element_in_matrix_SSE(m, dims, &maxIndex_SSE);
if (maxVal_ref == maxVal_SSE && maxIndex_ref == maxIndex_SSE)
{
printf("PASS: maxVal = %f, maxIndex = %d\n",
maxVal_ref, maxIndex_ref);
}
else
{
printf("FAIL: maxVal_ref = %f, maxVal_SSE = %f, maxIndex_ref = %d, maxIndex_SSE = %d\n",
maxVal_ref, maxVal_SSE, maxIndex_ref, maxIndex_SSE);
}
return 0;
}
Compile and run:
$ gcc -Wall -msse4 Yakovenko.c && ./a.out
PASS: maxVal = 0.999999, maxIndex = 120409
Obviously you can get the row and column indices if needed:
int rowIndex = maxIndex / dims;
int colIndex = maxIndex % dims;
From here it should be fairly straightforward to write an AVX2 implementation.

One approach would be to calculate maximum in the first pass, and find the index by linear search in the second pass. Here is a sample implementation in SSE2:
#define anybit __builtin_ctz //or lookup table with 16 entries...
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex) {
//first pass: calculate maximum as usual
__m128 vMaxVal = _mm_loadu_ps(m);
for (int i = 4; i < dims * dims; i += 4)
vMaxVal = _mm_max_ps(vMaxVal, _mm_loadu_ps(&m[i]));
//perform in-register reduction
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
//second pass: search for maximal value
for (int i = 0; i < dims * dims; i += 4) {
__m128 vIsMax = _mm_cmpeq_ps(vMaxVal, _mm_loadu_ps(&m[i]));
if (int mask = _mm_movemask_ps(vIsMax)) {
*maxIndex = i + anybit(mask);
return _mm_cvtss_f32(vMaxVal);
}
}
}
Note that the branch in the second loop should be almost perfectly predicted unless your input data is very small.
The solution suffers from several problems, notably:
It may work incorrectly in presence of weird floating point values, e.g. with NaNs.
If your matrix does not fit into CPU cache, then the code would read the matrix twice from the main memory, so it would be two times slower than the single-pass approach. This can be solved for large matrices by block-wise processing.
In the first loop each iteration depends on the previous one (vMaxVal is both modified and read) so it would be slowed down by latency of _mm_max_ps. Perhaps it would be great to unroll the first loop a bit (2x or 4x), while having 4 independent registers for vMaxVal (actually, the second loop would also benefit from unrolling).
Porting to AVX should be pretty straight-forward, except for the in-register reduction:
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_permute2f128_ps(vMaxVal, vMaxVal, 1));

yet another approach:
void find_largest_element_in_matrix_SSE(float * matrix, size_t n, int * row, int * column, float * v){
__m128 indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 update = _mm_setr_ps(4, 4, 4, 4);
__m128 max_indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 max = _mm_load_ps(matrix);
for (int i = 4; i < n * n; i+=4){
indecies = _mm_add_ps(indecies, update);
__m128 pm2 = _mm_load_ps(&matrix[i]);
__m128 mask = _mm_cmpge_ps(max, pm2);
max = _mm_max_ps(max, pm2);
max_indecies = _mm_or_ps(_mm_and_ps(max_indecies, mask), _mm_andnot_ps(mask, indecies));
}
__declspec (align(16)) int max_ind[4];
__m128i maxi = _mm_cvtps_epi32(max_indecies);
_mm_store_si128((__m128i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 4; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
void find_largest_element_in_matrix_AVX(float * matrix, size_t n, int * row, int * column, float * v){
__m256 indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 update = _mm256_setr_ps(8, 8, 8, 8, 8, 8, 8, 8);
__m256 max_indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 max = _mm256_load_ps(matrix);
for (int i = 8; i < n * n; i += 8){
indecies = _mm256_add_ps(indecies, update);
__m256 pm2 = _mm256_load_ps(&matrix[i]);
__m256 mask = _mm256_cmp_ps(max, pm2, _CMP_GE_OQ);
max = _mm256_max_ps(max, pm2);
max_indecies = _mm256_or_ps(_mm256_and_ps(max_indecies, mask), _mm256_andnot_ps(mask, indecies));
}
__declspec (align(32)) int max_ind[8];
__m256i maxi = _mm256_cvtps_epi32(max_indecies);
_mm256_store_si256((__m256i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 8; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js