a newbie in coding, really need your advice......
Recently I'm been trying some SSE coding to speed up simple calculations (addition and multiplication), I've been told there will be a 2x more speed boost with SSEx. But my result shows only a 1.25x boost, is there anything wrong with my code?
I've tried declaring the input arrays as global variables to maintain address continuity,not using local variables in SSE part, both in vain.
The following is the code,compiling with
g++ -mfpath=sse -mmmx -msse -msse2 -msse4.1 -O -Wall test.c
#define N 32768
#include<stdio.h>
#include<stdlib.h>
#include<stdint.h>
#include <smmintrin.h> //sse4.1
#include <emmintrin.h> //sse2
#include <xmmintrin.h> //sse
#include <mmintrin.h> //mmx
#include <time.h>
#include <string.h>
void init_with_rand(float *array);
float input1[N];
float input2[N];
float input3[N];
float output1[N];
float output2[N];
__m128 A,B,C,MUX,SUM;
int main(void)
{
clock_t t1, t2;
int i,j;
init_with_rand(input1);
init_with_rand(input2);
init_with_rand(input3);
t1 = clock();
for(j = 0; j < 1000000; j++){
for(i = 0; i < N; i++){
output1[i] = input1[i] * input2[i] + input3[i];
}
}
t1 = clock()-t1;
printf ("It took me %d clicks (%f seconds).\n",t1,((float)t1)/CLOCKS_PER_SEC);
/////////////////////////////////////////////////////////////////////////////////
t2 = clock();
for(j = 0; j < 1000000; j++){
for(i = 0; i < N; i+=4){
A = _mm_load_ps(input1+i);
B = _mm_load_ps(input2+i);
C = _mm_load_ps(input3+i);
MUX = _mm_mul_ps(A, B);
SUM = _mm_add_ps( MUX , C);
_mm_store_ps(output2+i, SUM);
}
}
t2 = clock()-t2;
printf ("It took me %d clicks (%f seconds).\n",t2,((float)t2)/CLOCKS_PER_SEC);
printf ("Performance is increased by %f times.\n",((float)t1/(float)t2));
if(!memcmp(output1,output2,N))
printf("Valid\n");
else if(memcmp(output1,output2,N))
printf("Invalid\n");
else
printf("Error\n");
return 0;
}
void init_with_rand(float *array)
{
int i;
for( i = 0; i < N; i++)
array[i] = static_cast <float> (rand()) / static_cast <float> (RAND_MAX);
}
Thanks for any suggestion!
Related
I am trying to understand if in C++11 new/delete are thread-safe.
I have found conflicting answers.
I am running this short program and sometimes I get different results from the two threads (I would expect to always get the same result instead).
Is this due to issues in memory allocation? What am I missing?
I tried with malloc/free, same behaviour.
I am compiling it with:
g++ -o out test_thread.cpp -std=c++11 -pthread
g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
Thanks.
#include <string>
#include <iostream>
#include <thread>
#include <stdlib.h>
void task(int id)
{
int N = 10000;
srand(100);
int j;
long tot = 0;
int *v = new int[N];
/* int *v = 0;
v = (int *) malloc (N * sizeof(int));
*/
for (j = 0; j < N; j++)
v[j] = rand();
for (j = 0; j < N; j++)
tot += v[j];
//free(v);
delete [] v;
printf("Thread #%d: total %ld\n", id, tot);
}
int main()
{
std::thread t1(task, 1);
std::thread t2(task, 2);
t1.join();
t2.join();
}
rand() shares state between threads; that already accounts for your observations.
I have been trying to parallelize computing the sum value of series using certain number of terms to the processors using block allocation.
In this program, I am generating arithmetic series and want to pass array as a shared variable in the pragma and trying to restructure the pragma parallel directive.
I am new to OPENMP-C. Kindly help me how to insert array value as a shared variable and stabilize the code. I am attaching the code below
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main (int argc, char *argv[])
{
int rank, comm_sz;
int number, i, first, difference, global_sum1, global_sum, nprocs, step, local_sum1, local_n;
int* a;
int BLOCK_LOW, BLOCK_HIGH;
double t0, t1;
comm_sz = atoi(argv[1]);
first = atoi(argv[2]);
difference = atoi(argv[3]);
number = atoi(argv[4]);
omp_set_num_threads (comm_sz);
rank = omp_get_thread_num();
a = (int*) malloc (n*sizeof(int));
printf("comm_sz=%d, first=%d, difference=%d, number of terms=%d\n",comm_sz, first, difference, number);
for(i=1; i <= number; i++){
a[i-1] = first + (i-1)*difference;
printf("a[%d]=%d\n",i-1,a[i]);
}
for(i=0; i < number; i++){
printf("a[%d]=%d\n",i,a[i]);}
t0 = omp_get_wtime();
#pragma omp parallel omp_set_num_threads(comm_sz, number, comm_sz, first, difference, global_sum1)
{
BLOCK_LOW = (rank * number)/comm_sz;
BLOCK_HIGH = ((rank+1) * number)/comm_sz;
#pragma omp parallel while private(i, local_sum1)
//int local_sum1 = 0;
i=BLOCK_LOW;
while( i < BLOCK_HIGH )
{
printf("%d, %d\n",BLOCK_LOW,BLOCK_HIGH);
local_sum1 = local_sum1 + a[i];
i++;
}
//global_sum1 = global_sum1 + local_sum1;
#pragma omp while reduction(+:sum1)
i=0;
for (i < comm_sz) {
global_sum1 = global_sum1 + local_sum1;
i++;
}
}
step = 2*first + (n-1)*difference;
sum = 0.5*n*step;
printf("sum is %d\n", global_sum );
t1 = omp_get_wtime();
printf("Estimate of pi: %7.5f\n", global_sum1);
printf("Time: %7.2f\n", t1-t0);
}
There are several mistakes in your code. I've tried to infer what you would like to do. So, I have rewritten your code according to my understanding.
Here is my suggestion:
int main (int argc, char *argv[])
{
int comm_sz, number, i, first, difference, global_sum, step;
int* a;
double t0, t1, sum;
comm_sz = atoi(argv[1]);
first = atoi(argv[2]);
difference = atoi(argv[3]);
number = atoi(argv[4]);
omp_set_num_threads (comm_sz);
a = (int*) malloc (number*sizeof(int));
printf("comm_sz=%d, first=%d, difference=%d, number of terms=%d\n",comm_sz, first, difference, number);
for(i=0; i < number; i++){
a[i] = first + (i)*difference;
printf("a[%d]=%d\n",i,a[i]);
}
t0 = omp_get_wtime();
global_sum = 0;
#pragma omp parallel for private(i) reduction(+:global_sum)
for (i=0; i < number; i++){
global_sum += a[i];
}
step = 2*first + (number-1)*difference;
sum = 0.5*number*step;
t1 = omp_get_wtime();
printf("sum is %d\n", global_sum);
printf("Estimate of pi: %7.5f\n", sum);
printf("Time: %7.2f\n", t1-t0);
}
I want to benchmark this simple C code:
float f(float x[], float y[]) {
float p = 0;
for (int i = 0; i <64; i++)
p += x[i] * y[i];
return p;
}
My motivation is to try different compiler flags and also gcc and clang to see what difference it makes.
I found this test framework and have been trying to get it to work. Although I am completely new to C++, here is my best effort:
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
while (state.KeepRunning()) {
benchmark::DoNotOptimize(f((float*) state.range(0), (float*) state.range(1)));
}
}
void args(benchmark::internal::Benchmark* b) {
std::uniform_real_distribution<float> rand(0, 100);
for (int i = 0; i < 10; i++) {
float* x = new float[64];
float* y = new float[64];
for (int i = 0; i < 64; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
printf("%f %f\n", x[i], y[i]);
}
b->Args({(int) x, (int) y});
}
}
BENCHMARK(f_benchmark)->Apply(args);
BENCHMARK_MAIN();
To compile it I do:
g++ -Ofast -Wall -std=c++11 test.cpp -Ibenchmark/include/benchmark/
-Lbenchmark/src/ -o test -lbenchmark -lpthread
This gives me :
test.cpp: In function ‘void f_benchmark(benchmark::State&)’:
test.cpp:20:54: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
benchmark::DoNotOptimize(f((float*) state.range(0), (float*) state.range(1)));
[...]
test.cpp: In function ‘void args(benchmark::internal::Benchmark*)’:
test.cpp:38:20: error: cast from ‘float*’ to ‘int’ loses precision [-fpermissive]
b->Args({(int) x, (int) y});
^
[...]
How can I get rid of those warnings and in general, am I doing this
right?
Your code casts a float* to int and back to a float* - this can cause problems, because sizeof(int) and sizeof(float*) are not guaranteed to be identical (i.e. on x86-64 int is 32bit, while float* is 64bit!).
The reason why you run into this issue, is probably because Args() supports only int arguments (they're supposed to be used as index for a family of benchmarks, not as actual function arguments in your function). To use parameters of a different type you could:
A. use global variables to store the pre-calculated random array
i.e.
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float x[64*10], y[64*10];
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
while (state.KeepRunning()) {
benchmark::DoNotOptimize(f(&x[state.range(0)*64], &y[state.range(0)*64]));
}
}
void args(benchmark::internal::Benchmark* b) {
std::uniform_real_distribution<float> rand(0, 100);
for (int i = 0; i < 64*10; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
}
for (int i = 0; i < 10; ++i)
b->Arg({ i });
}
BENCHMARK(f_benchmark)->Apply(args);
BENCHMARK_MAIN();
B. calculate the random numbers as part of the benchmark function (choose this approach if, you really require different random values for each iteration - the timing needs to be paused / resumed accordingly to not include the time for the random generation/memory allocation in the benchmark)
i.e.
#include <benchmark.h>
#include <benchmark_api.h>
#include <cstdio>
#include <random>
std::random_device seed;
std::mt19937 gen(seed());
float f(float* x, float* y) {
float p = 0;
for (int i = 0; i <64; i++) {
p += x[i] * y[i];
}
return p;
}
void f_benchmark(benchmark::State& state) {
state.PauseTiming();
std::uniform_real_distribution<float> rand(0, 100);
float* x = new float[64];
float* y = new float[64];
while (state.KeepRunning()) {
for (int i = 0; i < 64; i++) {
x[i] = rand(gen);
y[i] = rand(gen);
}
state.ResumeTiming();
benchmark::DoNotOptimize(f(x, y));
state.PauseTiming();
}
delete[] x;
delete[] y;
}
BENCHMARK(f_benchmark)->Apply([](benchmark::internal::Benchmark* b){
for (int i = 0; i < 10; ++i)
b->Arg({ i });
});
BENCHMARK_MAIN();
Side note: Also take care about the the leaking memory in your for loop - you should call the delete[] operator once for every new[] operator.
I am asked to vectorize a larger program. Before I started with the big program I wanted to see the effect of vectorization in isolated case. For this I created two programs that should show the idea of the outstanding transformation. One with an array of structs (no vec) and struct of arrays (with vec). I expected that the soa would outperform the aos by far, but it doesn't.
measured program loop A
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
struct myStruct {
double a, b, c;
};
#define NUM 100000000
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
struct myStruct *ptr = (struct myStruct *) malloc(NUM * sizeof(struct myStruct));
for (int i = 0; i < NUM; i++) {
ptr[i].a = i;
ptr[i].b = 2 * i;
}
t1 = high_resolution_clock::now();
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "took "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += ptr[i].c;
}
cout << "sum is "<< sum << endl;
}
measured program loop B
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <omp.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
#define NUM 100000000
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
double *A = (double *) malloc(NUM * sizeof(double));
double *B = (double *) malloc(NUM * sizeof(double));
double *C = (double *) malloc(NUM * sizeof(double));
for (int i = 0; i < NUM; i++) {
A[i] = i;
B[i] = 2 * i;
}
t1 = high_resolution_clock::now();
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "Aos "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += C[i];
}
cout << "sum "<<sum;
}
I compile with
icpc vectorization_aos.cpp -qopenmp --std=c++11 -cxxlib=/lrz/mnt/sys.x86_64/compilers/gcc/4.9.3/
icpc (v16)
compiled and executed on an Intel(R) Xeon(R) CPU E5-2697 v3 # 2.60GHz
in my test cases program A takes around 300ms, B 350ms. If I add unnecessary additional data to the struct in A it becomes increasingly slower (as more memory has to be loaded)
the -O3 flag does not have any impact on run-time
removing the #pragma simd directive does also not have impact. So either its auto vectorized or my vectorization does not work at all.
Questions:
am I missing something? Is this the way how one would vectorize a program?
Why is program 2 slower? Maybe the program is both times just memory bandwidth bound and I need to increase the computation density?
Are there programs/ code snippets that show the impact of vecotrization better and how can I verify that my program is actually executed vectorized.
i have a pretty weird problem regarding SSE usage.
I wrote the following function where i use SSE to calculate the maximum of the difference of two float arrays, each containing 64 floats.
The dists-array is a 2d-array allocated via _aligned_malloc.
#include <iostream>
#include <xmmintrin.h>
#include <time.h>
#include <stdio.h>
#include <algorithm>
#include <fstream>
#include "hr_time.h"
using namespace std;
float** dists;
float** dists2;
__m128* a;
__m128* b;
__m128* c;
__m128* d;
__m128 diff;
__m128 diff2;
__m128 mymax;
float* myfmax;
float test(int s, int t)
{
a = (__m128*) dists[s];
b = (__m128*) dists[t];
c = (__m128*) dists2[s];
d = (__m128*) dists2[t];
diff;
mymax = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
for (int i = 0; i <= 16; i++)
{
diff = _mm_sub_ps(*a, *b);
mymax = _mm_max_ps(diff, mymax);
diff2 = _mm_sub_ps(*d, *c);
mymax = _mm_max_ps(diff2, mymax);
a++;
b++;
c++;
d++;
}
_mm_store_ps(myfmax, mymax);
float res = max(max(max(myfmax[0], myfmax[1]), myfmax[2]), myfmax[3]);
return res;
}
int Deserialize(std::istream* stream)
{
int numOfElements, arraySize;
stream->read((char*)&numOfElements, sizeof(int)); // numOfElements = 64
stream->read((char*)&arraySize, sizeof(int)); // arraySize = 8000000
dists = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
dists2 = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
for (int j = 0; j < arraySize; j++)
{
dists[j] = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
dists2[j] = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
}
for (int i = 0; i < arraySize; i++)
{
stream->read((char*)dists[i], (numOfElements*sizeof(float)));
}
for (int i = 0; i < arraySize; i++)
{
stream->read((char*)dists2[i], (numOfElements*sizeof(float)));
}
return 0;
}
int main(int argc, char** argv)
{
int entries = 8000000;
myfmax = (float*)_aligned_malloc(4 * sizeof(float), 16);
ifstream fs("binary_file", std::ios::binary);
Deserialize(&fs);
CStopWatch* watch = new CStopWatch();
watch->StartTimer();
int i;
for (i = 0; i < entries; i++)
{
int s = rand() % entries;
int t = rand() % entries;
test(s, t);
}
watch->StopTimer();
cout << i << " iterations took " << watch->GetElapsedTimeMs() << "ms" << endl;
cin.get();
}
My problem is, that this code runs very fast if i run it in Visual Studio with an attached debugger. But as soon as i execute it without the debugger it gets very slow.
So i did a little reasearch and found out that one difference between those two starting methods is the "Debug Heap". So i disabled that by defining "_NO_DEBUG_HEAP=1". With that option i get very poor performance with an attached debugger too.
But i don't understand how i can get better performance by using the Debug Heap? And i don't know how to solve this problem, so i hope one of you guys can help me.
Thanks in advance.
Regards,
Karsten
Your code has a bug. _mm_store_ps stores an array of four floats but you only declare one. The compiler should not even allow you do to that.
Change
float fmax;
_mm_store_ps(fmax, max);
pi = std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);
to
float __declspec(align(16)) fmax[4];
_mm_store_ps(fmax, max);
return std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);