I have a short to float cast in C++ that is bottlenecking my code.
The code translates from a hardware device buffer which is natively shorts, this represents the input from a fancy photon counter.
float factor= 1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
A few details
Value should go from 0 to 2^16-1, it represents the pixel values of a highly sensitive camera
I'm on a multicore x86 machine with an i7 processor (i7 960 which is SSE 4.2 and 4.1).
Source is aligned to an 8 bit boundary (a requirement of the hardware device)
W*H is always divisible by 8, most of the time W and H are divisible by 8
This makes me sad, is there anything I can do about it?
I am using Visual Studio 2012...
Here's a basic SSE4.1 implementation:
// Broadcast the scale into all four float lanes.
const __m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
    // One aligned load fetches eight 16-bit ushorts: {a,b,c,d,e,f,g,h}.
    const __m128i packed = _mm_load_si128((const __m128i*)(source + i));
    // Bring {e,f,g,h} down into the low 64 bits so they can be widened as well.
    const __m128i upper = _mm_unpackhi_epi64(packed, packed);
    // Zero-extend each group of four to 32-bit integers, convert to float,
    // then multiply by the factor.
    const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(packed)), factor);
    const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(upper)), factor);
    // Aligned stores of the eight results.
    _mm_store_ps(destination + i + 0, lo);
    _mm_store_ps(destination + i + 4, hi);
}
This assumes:
source and destination are both aligned to 16 bytes.
W*H is a multiple of 8.
It's possible to do better by further unrolling this loop. (see below)
The idea here is as follows:
Load 8 shorts into a single SSE register.
Split the register into two: One with the bottom 4 shorts and the other with the top 4 shorts.
Zero-extend both registers into 32-bit integers.
Convert them both to floats.
Multiply by the factor.
Store them into destination.
EDIT :
It's been a while since I've done this type of optimization, so I went ahead and unrolled the loops.
Core i7 920 @ 3.5 GHz
Visual Studio 2012 - Release x64:
Original Loop : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416
Further unrolling resulted in diminishing returns.
Here's the test code:
#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;
/// Reference scalar conversion: destination[i] = source[i] / value for i in [0, size).
///
/// The samples are 16-bit unsigned counts stored in a `short` buffer, so each
/// element is reinterpreted as unsigned before it is widened. This matches the
/// zero-extension (_mm_cvtepu16_epi32) done by the vectorized versions; a plain
/// short -> int conversion would turn every value above 32767 negative.
void default_loop(float *destination,const short* source,float value,int size){
    const float factor = 1.0f / value;
    for (int i = 0; i < size; i++)
    {
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*factor;
    }
}
/// Converts 16-bit unsigned samples to float and scales them by 1/value,
/// eight samples per iteration (SSE4.1).
/// Uses aligned loads and stores and advances in steps of eight, so both
/// buffers must be 16-byte aligned and `size` is expected to be a multiple of 8.
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 8)
    {
        // One aligned 128-bit load brings in eight samples.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));
        // Upper four samples moved into the low half so they can be widened too.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);
        // Zero-extend to 32 bits, convert to float, apply the scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(packed)), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(upper)), scale);
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}
/// Converts `size` 16-bit unsigned samples to float scaled by 1/value (SSE4.1),
/// sixteen samples per main-loop iteration.
///
/// The unrolled loop only covers the largest multiple of 16 that fits in
/// `size`; a leftover group of eight gets one more vector step and any
/// remaining samples are converted by scalar code. Nothing past `size` is read
/// or written (a plain `i += 16` loop overruns both buffers whenever
/// size % 16 != 0, e.g. size = 8008).
/// `source` and `destination` must be 16-byte aligned (aligned loads/stores).
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
    const float scalar_factor = 1.0f / value;
    const __m128 factor = _mm_set1_ps(scalar_factor);
    int i = 0;
    // Main loop: two 8-sample groups per iteration.
    for (; size - i >= 16; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
        // Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);
        // Convert to 32-bit integers (zero-extension)
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);
        // Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);
        // Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);
        // Store
        _mm_store_ps(destination + i + 0,c0);
        _mm_store_ps(destination + i + 4,d0);
        _mm_store_ps(destination + i + 8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
    // At most one leftover group of eight (i is still a multiple of 16 here,
    // so the accesses stay aligned).
    if (size - i >= 8)
    {
        const __m128i a = _mm_load_si128((const __m128i*)(source + i));
        const __m128i b = _mm_unpackhi_epi64(a,a);
        _mm_store_ps(destination + i + 0,_mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(a)),factor));
        _mm_store_ps(destination + i + 4,_mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(b)),factor));
        i += 8;
    }
    // Scalar tail for sizes that are not a multiple of 8; same unsigned
    // interpretation as the vector path.
    for (; i < size; i++)
    {
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*scalar_factor;
    }
}
/// Adds up the first `size` floats of `destination` (in index order) and
/// prints the total on its own line.
void print_sum(const float *destination,int size){
    float total = 0;
    int idx = 0;
    while (idx < size){
        total += destination[idx];
        ++idx;
    }
    std::cout << total << std::endl;
}
/// Benchmark driver: fills a 16-byte aligned buffer with a ramp, then times
/// each conversion routine over the same data and prints the elapsed seconds
/// followed by the sum of the output.
int main(){
    const int size = 8000;
    const int iterations = 1000000;
    float value = 1.1;

    short *source = (short*)_mm_malloc(size * sizeof(short), 16);
    float *destination = (float*)_mm_malloc(size * sizeof(float), 16);
    for (int idx = 0; idx < size; ++idx){
        source[idx] = idx;
    }

    // Default Loop
    {
        const clock_t began = clock();
        for (int rep = 0; rep < iterations; ++rep){
            default_loop(destination,source,value,size);
        }
        std::cout << (double)(clock() - began) / CLOCKS_PER_SEC << std::endl;
        print_sum(destination,size);
    }

    // Vectorize 8, no unroll
    {
        const clock_t began = clock();
        for (int rep = 0; rep < iterations; ++rep){
            vectorize8_unroll1(destination,source,value,size);
        }
        std::cout << (double)(clock() - began) / CLOCKS_PER_SEC << std::endl;
        print_sum(destination,size);
    }

    // Vectorize 8, unroll 2
    {
        const clock_t began = clock();
        for (int rep = 0; rep < iterations; ++rep){
            vectorize8_unroll2(destination,source,value,size);
        }
        std::cout << (double)(clock() - began) / CLOCKS_PER_SEC << std::endl;
        print_sum(destination,size);
    }

    _mm_free(source);
    _mm_free(destination);
    // Keeps the console window open when launched from the IDE.
    system("pause");
}
I believe I have the best answer. My results are much faster than Mystical's. They only require SSE2 but take advantage of SSE3, SSE4, AVX, and even AVX2 if available. You don't have to change any code. You only have to recompile.
I ran over three sizes: 8008, 64000, and 2560*1920 = 4915200. I tried several different variations. I list the most important ones below. The function vectorize8_unroll2 is mystical's function. I made an improved version of his called vectorize8_unroll2_parallel. The functions vec16_loop_unroll2_fix and vec16_loop_unroll2_parallel_fix are my functions, which I believe are better than mystical's. These functions will automatically use AVX if you compile with AVX, but work fine on SSE4 and even SSE2.
Additionally, you wrote "W*H is always divisible by 8, most of the time W and H are divisible by 8".
So we can't assume W*H is divisible by 16 in all cases. Mystical's function vectorize8_unroll2 has a bug when size is not a multiple of 16 (try size=8008 in his code and you will see what I mean). My code has no such bug.
I'm using Agner Fog's vectorclass for the vectorization. It's not a lib or dll file. It's just a few header files. I use OpenMP for the parallelization. Here are some of the results:
Intel Xeon E5630 @ 2.53GHz (supports up to SSE4.2)
size 8008, size2 8032, iterations 1000000
default_loop time: 7.935 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000
size 64000, size2 64000, iterations 100000
default_loop time: 6.387 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 5.125 seconds, diff 0.000000
vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000
Edit: I added the results on a system with AVX using GCC at the end of this answer.
Below is the code. The code only looks long because I do lots of cross checks and test many variations. Download the vectorclass at
http://www.agner.org/optimize/#vectorclass . Copy the header files (vectorclass.h, instrset.h, vectorf128.h, vectorf256.h, vectorf256e.h, vectori128.h, vectori256.h, vectori256e.h) into the directory you compile from. Add /D__SSE4_2__ under C++/CommandLine. Compile in release mode. If you have a CPU with AVX then put /arch:AVX instead. Add OpenMP support under C++ properties/languages.
In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp
In the code below the function vec16_loop_unroll2_parallel requires the array be a multiple of 32. You can change the array size to be a multiple of 32 (that's what size2 refers to) or if that's not possible you can just use the function vec16_loop_unroll2_parallel_fix which has no such restriction. It's just as fast anyway.
#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
/// Allocates `size` bytes aligned to `align` bytes.
/// Uses _aligned_malloc on MSVC and posix_memalign elsewhere; returns a null
/// pointer when the allocation fails. Release with aligned_free().
inline void* aligned_malloc(size_t size, size_t align) {
#ifdef _MSC_VER
    return _aligned_malloc(size, align);
#else
    void *block = 0;
    // posix_memalign reports failure through a non-zero return code.
    if (posix_memalign(&block, align, size) != 0) {
        return 0;
    }
    return block;
#endif
}
/// Releases memory obtained from aligned_malloc(): free() on non-MSVC builds,
/// _aligned_free() on MSVC.
inline void aligned_free(void *ptr) {
#ifndef _MSC_VER
    free(ptr);
#else
    _aligned_free(ptr);
#endif
}
/// Scalar reference conversion: destination[i] = source[i] * (1/value)
/// for every i in [0, size).
void default_loop(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f/value;
    const unsigned short* in = source;
    float* out = destination;
    for (int remaining = size; remaining > 0; --remaining) {
        const int sample = *in++;   // ushort -> int
        *out++ = sample*scale;      // int*float -> float
    }
}
/// Same conversion as the scalar loop, with the iterations distributed by an
/// OpenMP `parallel for`.
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f / value;
    #pragma omp parallel for
    for (int idx = 0; idx < size; ++idx) {
        destination[idx] = static_cast<int>(source[idx])*scale;
    }
}
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
for (; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for (; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
/// OpenMP-parallel, twice-unrolled Vec16us conversion that accepts any `size`.
///
/// The parallel loop covers ROUND_DOWN(size, 32) samples in groups of 32; the
/// remaining samples are converted by a serial scalar loop afterwards.
/// (The unused outer `int i`, which the loop variables shadowed, is gone, and
/// the rounded bound is computed once.)
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
    float factor= 1.0f/value;
    const int vector_end = ROUND_DOWN(size, 32);
    #pragma omp parallel for
    for (int i=0; i < vector_end; i += 32) {
        Vec16us vi = Vec16us().load(source + i);
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);
        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }
    // Scalar tail for the samples past the last full group of 32.
    for(int i = vector_end; i < size; i++) {
        int sample = source[i];
        destination[i] = sample*factor;
    }
}
/// SSE4.1 intrinsics version, eight samples per iteration: zero-extend the
/// 16-bit samples to 32 bits, convert to float and multiply by 1/value.
/// Uses aligned loads/stores and advances by 8, so both buffers must be
/// 16-byte aligned and `size` is expected to be a multiple of 8.
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 8)
    {
        // Eight samples with one aligned load.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));
        // Upper four samples moved into the low 64 bits.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);
        // Widen, convert, scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(packed)), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(upper)), scale);
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}
/// SSE4.1 intrinsics version unrolled twice: sixteen samples per iteration.
/// Uses aligned loads/stores and advances by 16, so both buffers must be
/// 16-byte aligned and hold a multiple of 16 elements covering `size`.
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 16)
    {
        const unsigned short* in = source + base;
        float* out = destination + base;
        // Two aligned loads: samples [0,8) and [8,16) of this group.
        const __m128i first = _mm_load_si128((const __m128i*)(in + 0));
        const __m128i second = _mm_load_si128((const __m128i*)(in + 8));
        // Zero-extend the low and high four samples of each load to 32 bits.
        const __m128i first_lo = _mm_cvtepu16_epi32(first);
        const __m128i first_hi = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(first, first));
        const __m128i second_lo = _mm_cvtepu16_epi32(second);
        const __m128i second_hi = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(second, second));
        // Convert to float, scale, store.
        _mm_store_ps(out + 0, _mm_mul_ps(_mm_cvtepi32_ps(first_lo), scale));
        _mm_store_ps(out + 4, _mm_mul_ps(_mm_cvtepi32_ps(first_hi), scale));
        _mm_store_ps(out + 8, _mm_mul_ps(_mm_cvtepi32_ps(second_lo), scale));
        _mm_store_ps(out + 12, _mm_mul_ps(_mm_cvtepi32_ps(second_hi), scale));
    }
}
/// SSE4.1 intrinsics version (eight samples per iteration) with the iterations
/// distributed by an OpenMP `parallel for`.
/// Uses aligned loads/stores and advances by 8, so both buffers must be
/// 16-byte aligned and `size` is expected to be a multiple of 8.
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int base = 0; base < size; base += 8)
    {
        // Eight samples with one aligned load.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));
        // Upper four samples moved into the low 64 bits.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);
        // Widen, convert, scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(packed)), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(upper)), scale);
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}
/// SSE4.1 intrinsics version unrolled twice (sixteen samples per iteration)
/// with the iterations distributed by an OpenMP `parallel for`.
/// Uses aligned loads/stores and advances by 16, so both buffers must be
/// 16-byte aligned and hold a multiple of 16 elements covering `size`.
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int base = 0; base < size; base += 16)
    {
        const unsigned short* in = source + base;
        float* out = destination + base;
        // Two aligned loads: samples [0,8) and [8,16) of this group.
        const __m128i first = _mm_load_si128((const __m128i*)(in + 0));
        const __m128i second = _mm_load_si128((const __m128i*)(in + 8));
        // Zero-extend the low and high four samples of each load to 32 bits.
        const __m128i first_lo = _mm_cvtepu16_epi32(first);
        const __m128i first_hi = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(first, first));
        const __m128i second_lo = _mm_cvtepu16_epi32(second);
        const __m128i second_hi = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(second, second));
        // Convert to float, scale, store.
        _mm_store_ps(out + 0, _mm_mul_ps(_mm_cvtepi32_ps(first_lo), scale));
        _mm_store_ps(out + 4, _mm_mul_ps(_mm_cvtepi32_ps(first_hi), scale));
        _mm_store_ps(out + 8, _mm_mul_ps(_mm_cvtepi32_ps(second_lo), scale));
        _mm_store_ps(out + 12, _mm_mul_ps(_mm_cvtepi32_ps(second_hi), scale));
    }
}
/// Copies the first `size` elements of `a` into `b` (direction: a -> b).
void copy_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        b[i] = a[i];
    }
}
/// Compares the first `size` elements of `a` and `b`.
///
/// Returns 0 when every pair is equal. At the first mismatch it prints the
/// index and both values and returns that difference (a[i] - b[i]), so the
/// caller's "diff" is non-zero whenever the arrays disagree. Summing the
/// differences before breaking out could only ever add zeros, so a mismatch
/// would still be reported as 0.
float compare_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        const float diff = a[i] - b[i];
        if(diff!=0) {
            printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
            return diff;
        }
    }
    return 0;
}
/// Fills a[0..size) with pseudo-random 16-bit values derived from rand():
/// each draw is scaled to a float fraction of RAND_MAX, multiplied by 65536
/// and stored after conversion to int and then to unsigned short.
void randomize_array(unsigned short* a, const int size) {
    int idx = 0;
    while (idx < size) {
        const float fraction = (float)rand()/RAND_MAX;
        a[idx] = (int)(65536*fraction);
        ++idx;
    }
}
void run(int size, int iterations) {
int rd = ROUND_DOWN(size, 32);
int size2 = rd == size ? size : rd + 32;
float value = 1.1f;
printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);
void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
fp[0] = default_loop;
fp[1] = vec8_loop;
fp[2] = vec8_loop_unroll2;
fp[3] = vec16_loop;
fp[4] = vec16_loop_unroll2;
fp[5] = vec16_loop_unroll2_fix;
fp[6] = vectorize8_unroll1;
fp[7] = vectorize8_unroll2;
fp[8] = default_loop_parallel;
fp[9] = vec8_loop_parallel;
fp[10] = vec8_loop_unroll2_parallel;
fp[11] = vec16_loop_parallel;
fp[12] = vec16_loop_unroll2_parallel;
fp[13] = vec16_loop_unroll2_parallel_fix;
fp[14] = vectorize8_unroll1_parallel;
fp[15] = vectorize8_unroll2_parallel;
char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
"default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
"vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};
randomize_array(source, size2);
copy_arrays(destination_old, destination_ref, size);
fp[0](destination_ref, source, value, size);
for(int i=0; i<16; i++) {
copy_arrays(destination_old, destination, size);
double dtime = omp_get_wtime();
for (int it = 0; it < iterations; it++){
fp[i](destination, source, value, size);
}
dtime = omp_get_wtime() - dtime;
float diff = compare_arrays(destination, destination_ref, size);
printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
}
printf("\n");
aligned_free(source);
aligned_free(destination);
aligned_free(destination_old);
aligned_free(destination_ref);
}
/// Runs the benchmark for the three array sizes, with iteration counts chosen
/// per size.
int main() {
    static const int configs[][2] = {
        {8008, 1000000},
        {64000, 100000},
        {2560*1920, 1000},
    };
    const int config_count = sizeof(configs)/sizeof(configs[0]);
    for (int c = 0; c < config_count; ++c) {
        run(configs[c][0], configs[c][1]);
    }
}
Results using GCC on a system with AVX. GCC automatically vectorizes the loop (Visual Studio fails due to the short but works if you try int). You gain very little with hand-written vectorization code. However, using multiple threads can help depending upon the array size. For the small array size 8008 OpenMP gives a worse result. However, for the larger array size 128000 using OpenMP gives much better results. For the largest array size 4915200 it's entirely memory bound and OpenMP does not help.
i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
default_loop time: 1.319 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
vectorize8_unroll2 time: 1.227 seconds, diff 0.000000
vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000
size 128000, size2 128000, iterations 100000
default_loop time: 2.902 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
vectorize8_unroll2 time: 2.844 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 2.313 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000
vectorize8_unroll2 time: 2.318 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000
Using SSE intrinsics, on my machine [Quad Core Athlon, 3.3GHz, 16GB of RAM], and g++ -O2 optimisation [1] gives about 2.5-3x speed up. I also wrote a function to do the same thing in inline assembler, but it's not noticeably faster (again, this applies on my machine, feel free to run on other machines).
I tried a variety of sizes of H * W, and it all gives approximately the same results.
[1] Using g++ -O3 gives the same time for all four functions, as apparently -O3 enables "automatically vectorise code". So the whole thing was a bit of a waste of time assuming your compiler supports similar auto-vectorisation functionality.
Results
convert_naive sum=4373.98 t=7034751 t/n=7.03475
convert_naive sum=4373.98 t=7266738 t/n=7.26674
convert_naive sum=4373.98 t=7006154 t/n=7.00615
convert_naive sum=4373.98 t=6815329 t/n=6.81533
convert_naive sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4 sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4 sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4 sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4 sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4 sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm sum=4373.98 t=3176522 t/n=3.17652
Code
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>
#define W 1000
#define H 1000
/// Reads the CPU time-stamp counter with the `rdtsc` instruction and returns
/// the 64-bit value assembled from its two 32-bit halves (EDX:EAX).
static __inline__ unsigned long long rdtsc(void)
{
    unsigned low_bits, high_bits;
    __asm__ __volatile__ ("rdtsc" : "=a"(low_bits), "=d"(high_bits));
    const unsigned long long upper = ((unsigned long long)high_bits) << 32;
    return upper | (unsigned long long)low_bits;
}
/// Straightforward scalar conversion of W*H signed 16-bit samples:
/// destination[i] = source[i] * (1/32767).
void convert_naive(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    for (int idx = 0; idx < W*H; ++idx)
    {
        destination[idx] = static_cast<int>(source[idx])*scale;
    }
}
/// Scalar conversion unrolled by hand: four samples are read, then four
/// results written, per iteration. The loop advances by 4 over W*H elements.
void convert_unroll4(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    for (int base = 0; base < W*H; base += 4)
    {
        const short *in = source + base;
        float *out = destination + base;
        // All four loads happen before the first store, as in a manual unroll.
        const int s0 = in[0];
        const int s1 = in[1];
        const int s2 = in[2];
        const int s3 = in[3];
        out[0] = s0*scale;
        out[1] = s1*scale;
        out[2] = s2*scale;
        out[3] = s3*scale;
    }
}
/// SSE2 intrinsics conversion of W*H signed 16-bit samples, four per iteration:
/// destination[i] = source[i] * (1/32767).
/// Results are written with aligned 128-bit stores, so `destination` must be
/// 16-byte aligned.
void convert_sse_intrinsic(short *source, float *destination)
{
    const __m128 scale = _mm_set1_ps(1.0f/32767);
    const __m128i zero = _mm_setzero_si128();
    for (int i = 0; i < W*H; i+=4)
    {
        // Four shorts (64 bits) into the low half of the register, upper half zero.
        const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(source + i));
        // Interleave with zero: each short lands in the low 16 bits of a 32-bit lane.
        __m128i wide = _mm_unpacklo_epi16(packed, zero);
        // Shift up and arithmetically back down by 16 to sign-extend each lane.
        wide = _mm_srai_epi32(_mm_slli_epi32(wide, 16), 16);
        // destination[i..i+3] = value[0..3] * factor
        _mm_store_ps(destination + i, _mm_mul_ps(_mm_cvtepi32_ps(wide), scale));
    }
}
/// Hand-written SSE2 version of the conversion (x86-64, GCC inline asm):
/// sixteen signed 16-bit samples per loop iteration, each group of four
/// widened, sign-extended, converted to float and multiplied by 1/32767.
///
/// rax is the byte offset into `source`; the destination offset is rax*2
/// because each 2-byte sample becomes a 4-byte float. The loop is do-while
/// shaped, so W*H must be a positive multiple of 16, and `destination` must be
/// 16-byte aligned (movaps).
///
/// The loop branch is `jb` rather than `jbe`: with `jbe` the body ran once
/// more when rax was already equal to the byte size, reading 32 bytes beyond
/// the input and writing 16 floats beyond the output. The clobber list also
/// names every register the asm writes (xmm2, xmm4 and xmm5 were missing) plus
/// "memory" and "cc", since the asm stores to `destination` and changes flags.
void convert_sse_inlineasm(short *source, float *destination)
{
__m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
__asm__ __volatile__(
"\t pxor %%xmm1, %%xmm1\n"              /* xmm1 = 0, used for interleaving */
"\t movaps %3, %%xmm2\n"                /* xmm2 = factor */
"\t mov $0, %%rax\n"                    /* rax = byte offset into source */
"1:"
"\t movq (%1, %%rax), %%xmm0\n"         /* four groups of four shorts */
"\t movq 8(%1, %%rax), %%xmm3\n"
"\t movq 16(%1, %%rax), %%xmm4\n"
"\t movq 24(%1, %%rax), %%xmm5\n"
"\t punpcklwd %%xmm1, %%xmm0\n"         /* widen to 32-bit lanes */
"\t pslld $16, %%xmm0\n"                /* shift up/down: sign-extend */
"\t psrad $16, %%xmm0\n"
"\t cvtdq2ps %%xmm0, %%xmm0\n"          /* int -> float */
"\t mulps %%xmm2, %%xmm0\n"             /* scale */
"\t punpcklwd %%xmm1, %%xmm3\n"
"\t pslld $16, %%xmm3\n"
"\t psrad $16, %%xmm3\n"
"\t cvtdq2ps %%xmm3, %%xmm3\n"
"\t mulps %%xmm2, %%xmm3\n"
"\t punpcklwd %%xmm1, %%xmm4\n"
"\t pslld $16, %%xmm4\n"
"\t psrad $16, %%xmm4\n"
"\t cvtdq2ps %%xmm4, %%xmm4\n"
"\t mulps %%xmm2, %%xmm4\n"
"\t punpcklwd %%xmm1, %%xmm5\n"
"\t pslld $16, %%xmm5\n"
"\t psrad $16, %%xmm5\n"
"\t cvtdq2ps %%xmm5, %%xmm5\n"
"\t mulps %%xmm2, %%xmm5\n"
"\t movaps %%xmm0, (%0, %%rax, 2)\n"    /* store 16 floats */
"\t movaps %%xmm3, 16(%0, %%rax, 2)\n"
"\t movaps %%xmm4, 32(%0, %%rax, 2)\n"
"\t movaps %%xmm5, 48(%0, %%rax, 2)\n"
"\t addq $32, %%rax\n"                  /* 16 shorts consumed */
"\t cmpq %2, %%rax\n"
"\t jb 1b\n"                            /* continue while offset < byte size */
: /* no outputs */
: "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
"rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory", "cc");
}
// Input samples for every benchmark, 16-byte aligned.
short inbuffer[W * H] __attribute__ ((aligned (16)));
// Output of the routine under test, 16-byte aligned, with 16 floats of slack
// beyond W*H.
// NOTE(review): the slack presumably gives headroom to a vector routine that
// writes one 16-float group past the end - confirm before shrinking it.
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
// Reference output, filled by convert_naive in bm() when DEBUG is defined.
float outbuffer2[W * H];
#endif
// Signature shared by all conversion routines under test.
typedef void (*func)(short *source, float *destination);
// One benchmark table entry. The BM() macro brace-initializes it, so the
// member order (name, then fn) must be kept.
struct BmEntry
{
const char *name;   // label printed next to the results
func fn;            // routine to time
};
/// Times one call of `e.fn` over the global buffers with rdtsc and prints the
/// entry name, the sum of the output, the elapsed tick count and ticks per
/// element.
///
/// When DEBUG is defined the output is also compared element by element with
/// convert_naive and every mismatch is printed. The guard is `#ifdef DEBUG`,
/// the same test that declares outbuffer2, so the two cannot disagree.
void bm(BmEntry& e)
{
    // Clear the output so values left by a previous run cannot leak into the sum.
    memset(outbuffer, 0, sizeof(outbuffer));
    unsigned long long t = rdtsc();
    e.fn(inbuffer, outbuffer);
    t = rdtsc() - t;
    // Sum of the output, printed as a quick consistency check between routines.
    float sum = 0;
    for(int i = 0; i < W * H; i++)
    {
        sum += outbuffer[i];
    }
#ifdef DEBUG
    convert_naive(inbuffer, outbuffer2);
    for(int i = 0; i < W * H; i++)
    {
        if (outbuffer[i] != outbuffer2[i])
        {
            std::cout << i << ":: " << inbuffer[i] << ": "
                      << outbuffer[i] << " != " << outbuffer2[i]
                      << std::endl;
        }
    }
#endif
    std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t <<
        " t/n=" << (double)t / (W * H) << std::endl;
}
// Builds a BmEntry initializer from a function: its name as a string, then
// the function itself.
#define BM(x) { #x, x }
// Routines to benchmark, in the order they are run and reported.
BmEntry table[] =
{
BM(convert_naive),
BM(convert_unroll4),
BM(convert_sse_intrinsic),
BM(convert_sse_inlineasm),
};
/// Fills the input with a ramp and runs every table entry five times.
int main()
{
    // Ramp input; indices beyond the range of short are narrowed by the cast.
    for(int idx = 0; idx < W * H; ++idx)
    {
        inbuffer[idx] = (short)idx;
    }
    const int entryCount = sizeof(table)/sizeof(table[0]);
    for(int entry = 0; entry < entryCount; ++entry)
    {
        for(int repeat = 0; repeat < 5; ++repeat)
        {
            bm(table[entry]);
        }
    }
    return 0;
}
I'm not sure whether the condition expression in the loop is evaluated only once.
You can try:
float factor= 1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
int value = source[i];//short -> int
destination[i] = value*factor;//int->float
}
This is not a valid answer, don't take it as it, but I'm actually wondering how would the code behave by using a 256k look-up table. (basically a 'short to float' table with 65536 entries).
A CoreI7 has about 8 megabytes of cache I believe, so the look-up table would fit in the data cache.
I really wonder how that would impact the performance :)
You can also use OpenMP to put every core of your CPU to work; it is simple, just do the following:
#include <omp.h>
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
Here is the result based on the previous program; just add the pragma like this:
// NOTE(review): this parallelizes the benchmark's repetition loop, so each
// thread runs whole conversions over the same buffers - presumably it measures
// repetitions spread over cores rather than one conversion split across them;
// confirm before comparing these timings with the single-threaded ones.
#pragma omp parallel for
for (int it = 0; it < iterations; it++){
...
}
and then here is the result
beta#beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp
beta#beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007
beta#beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3
beta#beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007
. .
The result shows a 100% improvement with OpenMP. Visual C++ supports OpenMP too.
You could try to approximate the expression
float factor = 1.0f/value;
by a fraction numerator/denominator where both numerator and denominator are ints. This can be done to the precision you need in your application like
// Fixed scale of the fraction; it sets how many digits of factor are kept.
int denominator = 10000;
// factor * denominator is computed in floating point and truncated when it is
// converted to int.
int numerator = factor * denominator;
Then you can do your computation in integer arithmetics like
int value = source[i];
// Multiply by numerator/denominator; dividing by numerator again would only
// hand back `value` unchanged.
destination[i] = (value * numerator) / denominator;
You have to take care of overflows, perhaps you need to switch to long (or even long long on 64bit systems) for the calculation.
How do I thunk an arbitrary function with an arbitrary (fixed) number of arguments, on x86 and x64?
(I don't need floating-point, SSE, or the like. The arguments are all integers or pointers.)
Here's my generic implementation.
I initially made it with AsmJit, then modified it by hand to remove the dependency.
It works for both x86 and x64!
It works for both cdecl and stdcall!
It should also work for "thiscall", both on VC++ and GCC, but I haven't tested it.
(VC++ would probably not touch the 'this' pointer, whereas GCC would treat it as the first argument.)
It can bind an arbitrary number of arguments at any position in the parameter list!
Just beware:
It does not work for variadic functions, like printf.
Doing so would either require you to provide the number of arguments dynamically (which is painful) or would require you to store the return-pointers somewhere other than the stack, which is complicated.
It was not designed for ultra-high performance, but it should still be fast enough.
The speed is O(total parameter count), not O(bound parameter count).
Scroll to the right to see the assembly code.
#include <stddef.h>
/**
 * Writes an x86 / x64 machine-code thunk into `buffer` that forwards a call
 * to `f` with `n` extra pointer-sized arguments spliced into the argument list.
 *
 * The emitted code reserves `param_count` pointer-sized slots on the stack,
 * copies the first `i` incoming argument slots into them, stores the `n`
 * values from `bound` next, copies the remaining `param_count - i - n`
 * incoming slots after those, calls `f`, and returns to the original caller.
 * On 64-bit builds the four register arguments (rcx, rdx, r8, r9) are first
 * spilled to the stack so they can be copied like stack slots, and the first
 * four rebuilt slots are reloaded into those registers before the call.
 *
 * The 32-bit call uses a displacement relative to `buffer`'s address at
 * generation time, so the generated code must be executed in place.
 * Making `buffer` executable is the caller's responsibility.
 *
 * @param f           Target function.
 * @param param_count Total number of pointer-sized parameters `f` receives,
 *                    bound ones included.
 * @param buffer      Destination for the generated code. The 64-bit path emits
 *                    144 + 17 * n bytes; the 32-bit path emits fewer.
 * @param i           Number of incoming argument slots copied before the bound
 *                    values, i.e. the position at which they are inserted.
 * @param bound       The `n` values to insert.
 * @param n           Number of entries in `bound`.
 * @param thiscall    Only consulted on 32-bit builds: when true, the first
 *                    rebuilt slot is popped into ecx before the call.
 * @return Number of bytes written to `buffer`.
 */
size_t vbind(
    void *(/* cdecl, stdcall, or thiscall */ *f)(), size_t param_count,
    unsigned char buffer[/* >= 144 + n * (5 + sizeof(int) + sizeof(void*)) */],
    size_t const i, void *const bound[], unsigned int const n, bool const thiscall)
{
    unsigned char *p = buffer;
    unsigned char s = sizeof(void *);
    unsigned char b = sizeof(int) == sizeof(void *) ? 2 : 3;                                    // log2(sizeof(void *))
    // Every `if (b > 2) { *p++ = 0x48; }` below emits the REX.W prefix on 64-bit builds only.
    *p++ = 0x55;                                                                                // push rbp
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xEC;                                       // mov rbp, rsp
    if (b > 2)
    {
        // Spill the register arguments into the slots above the return address.
        *p++ = 0x48; *p++ = 0x89; *p++ = 0x4C; *p++ = 0x24; *p++ = 2 * s;                       // mov [rsp + 2 * s], rcx
        *p++ = 0x48; *p++ = 0x89; *p++ = 0x54; *p++ = 0x24; *p++ = 3 * s;                       // mov [rsp + 3 * s], rdx
        *p++ = 0x4C; *p++ = 0x89; *p++ = 0x44; *p++ = 0x24; *p++ = 4 * s;                       // mov [rsp + 4 * s], r8
        *p++ = 0x4C; *p++ = 0x89; *p++ = 0x4C; *p++ = 0x24; *p++ = 5 * s;                       // mov [rsp + 5 * s], r9
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0xBA; *(*(size_t **)&p)++ = param_count;                 // mov rdx, <param_count>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xC2;                                       // mov rax, rdx
    if (b > 2) { *p++ = 0x48; } *p++ = 0xC1; *p++ = 0xE0; *p++ = b;                             // shl rax, log2(sizeof(void *))
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xE0;                                       // sub rsp, rax
    *p++ = 0x57;                                                                                // push rdi
    *p++ = 0x56;                                                                                // push rsi
    *p++ = 0x51;                                                                                // push rcx
    *p++ = 0x9C;                                                                                // pushfq
    if (b > 2) { *p++ = 0x48; } *p++ = 0xF7; *p++ = 0xD8;                                       // neg rax
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8D; *p++ = 0x7C; *p++ = 0x05; *p++ = 0x00;             // lea rdi, [rbp + rax]
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8D; *p++ = 0x75; *p++ = 2 * s;                         // lea rsi, [rbp + 10h]
    if (b > 2) { *p++ = 0x48; } *p++ = 0xB9; *(*(size_t **)&p)++ = i;                           // mov rcx, <i>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xD1;                                       // sub rdx, rcx
    *p++ = 0xFC;                                                                                // cld
    *p++ = 0xF3; if (b > 2) { *p++ = 0x48; } *p++ = 0xA5;                                       // rep movs [rdi], [rsi]
    for (unsigned int j = 0; j < n; j++)
    {
        // Byte offset of the j-th bound value relative to rdi.
        unsigned int const o = j * sizeof(p);
        if (b > 2) { *p++ = 0x48; } *p++ = 0xB8; *(*(void ***)&p)++ = bound[j];                 // mov rax, <arg>
        if (b > 2) { *p++ = 0x48; } *p++ = 0x89; *p++ = 0x87; *(*(int **)&p)++ = o;             // mov [rdi + <iArg>], rax
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0xB8; *(*(size_t **)&p)++ = n;                           // mov rax, <count>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xD0;                                       // sub rdx, rax
    if (b > 2) { *p++ = 0x48; } *p++ = 0xC1; *p++ = 0xE0; *p++ = b;                             // shl rax, log2(sizeof(void *))
    if (b > 2) { *p++ = 0x48; } *p++ = 0x03; *p++ = 0xF8;                                       // add rdi, rax
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xCA;                                       // mov rcx, rdx
    *p++ = 0xF3; if (b > 2) { *p++ = 0x48; } *p++ = 0xA5;                                       // rep movs [rdi], [rsi]
    *p++ = 0x9D;                                                                                // popfq
    *p++ = 0x59;                                                                                // pop rcx
    *p++ = 0x5E;                                                                                // pop rsi
    *p++ = 0x5F;                                                                                // pop rdi
    if (b > 2)
    {
        // Reload the first four rebuilt slots into the argument registers.
        *p++ = 0x48; *p++ = 0x8B; *p++ = 0x4C; *p++ = 0x24; *p++ = 0 * s;                       // mov rcx, [rsp + 0 * s]
        *p++ = 0x48; *p++ = 0x8B; *p++ = 0x54; *p++ = 0x24; *p++ = 1 * s;                       // mov rdx, [rsp + 1 * s]
        *p++ = 0x4C; *p++ = 0x8B; *p++ = 0x44; *p++ = 0x24; *p++ = 2 * s;                       // mov r8 , [rsp + 2 * s]
        *p++ = 0x4C; *p++ = 0x8B; *p++ = 0x4C; *p++ = 0x24; *p++ = 3 * s;                       // mov r9 , [rsp + 3 * s]
        *p++ = 0x48; *p++ = 0xB8; *(*(void *(***)())&p)++ = f;                                  // mov rax, <target_ptr>
        *p++ = 0xFF; *p++ = 0xD0;                                                               // call rax
    }
    else
    {
        if (thiscall) { *p++ = 0x59; }                                                          // pop ecx
        // The rel32 operand is measured from the end of the call instruction
        // (opcode byte + 4-byte displacement). Compute that address up front
        // instead of reading `p` in the same expression that increments it,
        // which made the result depend on the compiler's evaluation order.
        unsigned char *const next_ip = p + 1 + s;
        *p++ = 0xE8; *(*(ptrdiff_t **)&p)++ = (unsigned char *)f - next_ip;                     // call <fn_rel>
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xE5;                                       // mov rsp, rbp
    *p++ = 0x5D;                                                                                // pop rbp
    *p++ = 0xC3;                                                                                // ret
    return p - &buffer[0];
}
Example (for Windows):
#include <assert.h>
#include <stdio.h>
#include <Windows.h>
// Demo target for the thunk: prints its six arguments and returns `value`.
// Before printing it calls itself with `u` decremented by one until `u`
// reaches zero, so a call with u == k prints k + 1 lines.
void *__cdecl test(void *value, void *x, void *y, void *z, void *w, void *u)
{
    // Compare against the null pointer with != ; an ordered comparison
    // (`u > 0`) between a pointer and literal 0 is not valid standard C++.
    if (u != 0) { test(value, x, y, z, w, (void *)((size_t)u - 1)); }
    printf("Test called! %p %p %p %p %p %p\n", value, x, y, z, w, u);
    return value;
}
// Demo functor used to exercise the thunk with a member function.
// operator() prints `local` followed by its six arguments and returns `value`;
// before printing it calls itself with `u` decremented by one until `u`
// reaches zero.
struct Test
{
    void *local;
    void *operator()(void *value, void *x, void *y, void *z, void *w, void *u)
    {
        // Compare against the null pointer with != ; an ordered comparison
        // (`u > 0`) between a pointer and literal 0 is not valid standard C++.
        if (u != 0) { (*this)(value, x, y, z, w, (void *)((size_t)u - 1)); }
        printf("Test::operator() called! %p %p %p %p %p %p %p\n", local, value, x, y, z, w, u);
        return value;
    }
};
int main()
{
unsigned char thunk[1024]; unsigned long old;
VirtualProtect(&thunk, sizeof(thunk), PAGE_EXECUTE_READWRITE, &old);
void *args[] = { (void *)0xBAADF00DBAADF001, (void *)0xBAADF00DBAADF002 };
void *(Test::*f)(void *value, void *x, void *y, void *z, void *w, void *u) = &Test::operator();
Test obj = { (void *)0x1234 };
assert(sizeof(f) == sizeof(void (*)())); // virtual function are too big, they're not supported :(
vbind(*(void *(**)())&f, 1 + 6, thunk, 1 + 1, args, sizeof(args) / sizeof(*args), true);
((void *(*)(void *, int, int, int, int))&thunk)(&obj, 3, 4, 5, 6);
vbind((void *(*)())test, 6, thunk, 1, args, sizeof(args) / sizeof(*args), false);
((void *(*)(int, int, int, int))&thunk)(3, 4, 5, 6);
}
Here is a modification for thiscall functions
The vbind() stub generator above is meant to be used for C++ member functions as well, although it is not clear how to proceed. Here's what I've come up with:
// experimental x64 thiscall thunking
// Wraps a vbind() thunk that forwards a three-argument, HOOKPROC-style call to
// the member function CBTHook_stub, with `this` bound as the first argument.
// The thunk lives inside the object, so it is only valid while the object is.
class TestHook {
public:
    typedef void (TestHook::*TMFP)();
    TestHook(DWORD num)
    {
        m_context = num;
        // Reinterpret the member-function pointer as a plain function pointer
        // so it can be handed to vbind().
        union { void* (*func)(); TMFP method; } addr;
        addr.method = (TMFP)&TestHook::CBTHook_stub;
        // pass "this" as the first fixed argument
        void *args[] = { this };
        // 4 total slots: the bound `this` at position 0 plus the three
        // caller-supplied arguments. vbind() only consults its last flag on
        // 32-bit builds, so false is passed here.
        size_t thunk_size = vbind(addr.func, 4, m_thunk, 0, args, 1, false);
        ATLASSERT(thunk_size < sizeof(m_thunk));
        unsigned long old;
        VirtualProtect(m_thunk, thunk_size, PAGE_EXECUTE_READWRITE, &old);
        FlushInstructionCache(GetCurrentProcess(), m_thunk, thunk_size);
    }
    // Returns the address of the generated code as a callable pointer.
    FARPROC GetThunk() const { return (FARPROC)(void*)m_thunk; }
protected:
    // test thiscall: one integer and two 8-byte arguments
    LRESULT CBTHook_stub(int nCode, WPARAM wParam, LPARAM lParam)
    {
        ATLTRACE(_T("this=%p, code=%d, wp=%x, lp=%x, context=%x\n"), this, nCode, wParam, lParam, m_context);
        return lParam;
    }
    DWORD m_context;
    unsigned char m_thunk[1024]; // fixed; don't know size required apriori!
};
#ifndef _WIN64
#error does not work for win32
#endif
// Creates a TestHook, calls its thunk as a HOOKPROC with (1, 2, 3), and traces
// the object address together with the value the call returned.
// Declared as `int main`: `void main` is not a valid entry-point signature in
// standard C++.
int main(void)
{
    TestHook tmp(0xDeadBeef);
    HOOKPROC proc = (HOOKPROC)tmp.GetThunk();
    ATLTRACE(_T("object %p return value=%d\n"), &tmp, proc(1, 2, 3));
}
I am not an assembly guru, but this code correctly stubs into the member function for 64-bit code. There are some implicit assumptions (I'm not 100% sure they are valid, so please correct me if I'm wrong):
in x64 (amd / microsoft VS) all function arguments are passed as 8 bytes long. So although vbind was just for pointer-type arguments, it is possible to thunk into other function prototypes (e.g. the HOOKPROC takes one integer and two __int64)
"this" pointer is passed as the first stack argument in x64 instead of ECX. I used the bounded argument to pass "this" pointer and provide context to the C++ object