visual studio 2012 parallelism with openmp

visual studio 2012 parallelism with openmp - c++

I am studying computer architecture in the university.
I have a home work which making convolution faster using parallelism(openMP).
For now I made convolution code (your_convolution) with omp, but It did not be faster at all!
I'm using visual studio 2012.
How can i make it faster??
here's whole convolution's code.
give me some help.
#include <intrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <omp.h>
using namespace std;
void convolution(float* output, float* input, float* filter, int width, int height, int r)
{
assert(output!=NULL && input!=NULL && filter!=NULL && width>0 && height>0 && r>0);
int w1=width-1;
int h1=height-1;
int fwidth=2*r+1;
int i, j, di, dj, ii, jj;
float sum;
for (i=0;i<height;++i)
{
for (j=0;j<width;++j)
{
sum=0;
for (di=-r;di<=r;++di)
{
ii=i+di;
ii=max(min(ii,h1),0);
for (dj=-r;dj<=r;++dj)
{
jj=j+dj;
jj=max(min(jj,w1),0);
sum+=filter[dj+r+(di+r)*fwidth]*input[jj+ii*width];
}
}
output[j+i*width]=sum;
}
}
}
void your_convolution(float* output, float* input, float* filter, int width, int height, int r)
{
// write your code here //
assert(output != NULL && input != NULL && filter != NULL && width>0 && height>0 && r>0);
int w1 = width - 1;
int h1 = height - 1;
int fwidth = 2 * r + 1;
int i, j, di, dj, ii, jj;
float sum;
omp_set_num_threads(4);
#pragma omp parallel
{
for (i = 0; i<height; ++i)
{
for (j = 0; j<width; ++j)
{
sum = 0;
for (di = -r; di <= r; ++di)
{
ii = i + di;
ii = max(min(ii, h1), 0);
#pragma omp parallel for
for (dj = -r; dj <= r; ++dj)
{
jj = j + dj;
jj = max(min(jj, w1), 0);
sum += filter[dj + r + (di + r)*fwidth] * input[jj + ii*width];
}
}
output[j + i*width] = sum;
}
}
}
}
int main()
{
// load the image
int width=1920; // width of the image
int height=1080; // height of the image
int len=width*height; // pixels in the image
int i, j, ii, jj, i2;
float* data=(float*)malloc(sizeof(float)*len); // buffer to load the image
float* output=(float*)malloc(sizeof(float)*len); // output buffer
FILE* fp=fopen("../image.dat", "rb"); // open the image, assume that the bld directory is a subdirectory to the src directory
fread(data, sizeof(float), width*height, fp); // load the float values, the image is gray.
fclose(fp);
// set the filter
int radius=3; // filter radius
float sigma=(float)(radius/3.0); // standard deviation of the Gaussian filter
float beta=(float)(-0.5/(sigma*sigma)); // coefficient exp(beta*x*x)
int fwidth=2*radius+1; // width of the filter
int flen=fwidth*fwidth; // number of elements in the filter
float* filter=(float*)malloc(sizeof(float)*flen); // filter buffer
float sum_weight=0; // we want to normalize the filter weights
for (i=-radius;i<=radius;++i)
{
ii=(i+radius)*fwidth;
i2=i*i;
for (j=-radius;j<=radius;++j)
{
jj=j+radius+ii;
filter[jj]=exp(beta*(i2+j*j));
sum_weight+=filter[jj];
}
}
sum_weight=(float)(1.0/sum_weight);
for (i=0;i<flen;++i)
filter[i]*=sum_weight; // now the weights are normalized to sum to 1
clock_t start=clock();
convolution(output, data, filter, width, height, radius);
clock_t finish=clock();
double duration = (double)(finish - start) / CLOCKS_PER_SEC;
printf( "convolution naive: %2.3f seconds\n", duration );
float* output2=(float*)malloc(sizeof(float)*len); // output buffer
start=clock();
your_convolution(output2, data, filter, width, height, radius);
finish=clock();
double duration2 = (double)(finish - start) / CLOCKS_PER_SEC;
printf( "your convolution: %2.3f seconds\n", duration2 );
double sum=0;
for (i=0;i<len;++i)
sum+=fabs(output[i]-output2[i]);
printf("difference of the outputs=%lf\n", sum);
printf( "The performance of your convolve is %2.1f times higher than convolution naive.\n", duration/duration2);
free(data);
free(filter);
free(output);
return 0;
}

Related

FFTW returns different results in some loops

I'm new to use C++.
I have tried to implement FFT using Eigen and fftw3 (version 3.3.10).
The purpose is to read measurement data from CSV file and analyze FFT.
However, I realized that fftw returns different results some time (sometimes return results correctly, sometimes wrong results...) in some loops if do some analysis again and again.
The problem might be just a memory leaking or casting variable problem as I'm new to use C++.
I'm very appreciated if you give me any advice or tips.
Thanks in advance!
#define EIGEN_FFTW_DEFAULT
#include <iostream>
#include <string>
#include <vector>
#include <cmath>
#include <fstream>
#include <sstream>
#include "Eigen/Dense"
#include <fftw3.h>
using namespace Eigen;
// define functions
template <typename T>
T readCSV(const std::string &path);
int nextpow2(int n);
VectorXd offsetData(VectorXd v);
void fftw_test(VectorXd x);
// calculate exponent of next higher power of 2
int nextpow2(int n)
{
if (n < 0) // n must be int
return 0;
if (n == 1) // n
return 1;
return (int)floor(log2(n - 1)) + 1.0;
};
// Read Measurement Data
template <typename T>
T readCSV(const std::string &path)
{
// https://stackoverflow.com/questions/34247057/how-to-read-csv-file-and-assign-to-eigen-matrix
std::ifstream file;
std::string line;
std::string cell;
std::vector<double> row;
uint rows = 0;
file.open(path);
std::cout << "Opend file: " << path << std::endl;
std::getline(file, line); // skip the first header line
while (std::getline(file, line))
{
std::stringstream lineStream(line);
while (std::getline(lineStream, cell, ','))
{
row.push_back(std::stod(cell)); // insert value as double
}
++rows;
}
return Map<const Matrix<typename T::Scalar, T ::RowsAtCompileTime, T::ColsAtCompileTime, RowMajor> >(row.data(), rows, row.size() / rows);
};
void fftw_test(VectorXd x)
{
// Convert data unit
x = x * 980.665 * 10; // Unit conversion：[G] to [cm/sec^2] to [mm/sec^2]
int ns = x.size(); // number of samples
int nfft = std::pow(2, nextpow2(ns)); // number of fft
// Zero padding to array
VectorXd xpad;
int npad = nfft - ns;
if (npad > 0)
{
xpad = VectorXd(nfft);
for (int i = 0; i < ns; ++i)
{
xpad(i) = x(i);
}
}
else
{
xpad = x;
}
// /* prepare a cosine wave */
// for (i = 0; i < N; i++)
// {
// in[i][0] = cos(3 * 2 * M_PI * i / N);
// in[i][1] = 0;
// }
int N = nfft;
fftw_complex *in, *out, *in2;
in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
in2 = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
fftw_plan p, q;
for (int i = 0; i < N; i++)
{
in[i][0] = (double)xpad(i);
in[i][1] = 0;
}
p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT);
fftw_execute(p);
for (int i = 0; i < 10; i++)
{
printf("in: %3d %+9.5f %+9.5f I\n", i, in[i][0], in[i][1]);
}
for (int i = 0; i < 10; i++)
{
printf("freq: %3d %+9.5f %+9.5f I\n", i, out[i][0], out[i][1]);
}
fftw_destroy_plan(p);
fftw_free(in);
fftw_free(out);
fftw_free(in2);
fftw_cleanup();
};
VectorXd offsetData(VectorXd v)
{
// Offset by mean values
int ns = v.size(); // number of samples
VectorXd ones = MatrixXd::Ones(ns, 1);
v = v - v.mean() * ones;
return v;
};
int main()
{
// Read measured data from csv file
MatrixXd measuredData = readCSV<MatrixXd>("./sampleCsv/20220208-134655_A351AU.csv");
// Extract a vertical acceleration column
VectorXd Acc = measuredData.col(4);
VectorXd Acc_offset = offsetData(Acc / 1000);
for (int i = 0; i < 100; ++i)
{
// fftw bug test
printf("loop: %ith \n", i);
fftw_test(Acc_offset);
};
return 0;
}
The sample CSV file is here.
https://drive.google.com/file/d/1DQO2eeMX7AfxjnuW8DDJMOitxHNuIDHA/view?usp=sharing
The correct results should be below.
freq: 0 -0.00000 +0.00000 I
freq: 1 +320.64441 -83.56961 I
freq: 2 -113.66004 -195.80680 I
freq: 3 -28.57778 -13.57046 I
freq: 4 -47.71908 +185.43538 I
freq: 5 +381.01770 +92.18739 I
freq: 6 +430.73267 -348.16464 I
freq: 7 -111.55714 -796.10333 I
freq: 8 -810.79331 -273.42916 I
freq: 9 -624.83461 +607.38775 I

How to speed up my code by openmp with multi for loops

Now, l am trying to accelerate the calculation of center of mass in different ROI areas.Her is my original code:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv/cv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "omp.h"
#include <time.h>
#include <ctime>
#define File_SubAperture "SubAperture.txt"
#define Row_Subapaerture 750
Mat src;
Mat src_gray;
using namespace cv;
using namespace std;
int thresh = 0;
int max_thresh = 255;
Mat ROI;
int nn = 0;
float subApX[751] = { 0 };
float subApY[751] = { 0 };
float x = 0.0;
float y = 0.0;
float k = 0.0;
float f = 0.0;
int main()
{
clock_t startTime, endTime;
int SubAperture[Row_Subapaerture][Col_Subaperture];
double data1, data2, data3, data4;
int i;
Rect rect_subaperture[Row_Subapaerture];
FILE * fp_SubAperture;
FILE* px;
px = fopen("C:\\Users\\DELL\\Desktop\\AO acceleration\\center-coordinates-x.txt", "w+");
FILE* py;
py=fopen("C:\\Users\\DELL\\Desktop\\AO acceleration\\center-coordinates-y.txt", "w+");
//---read file Sub-aperture.txt---*
fp_SubAperture = fopen(File_SubAperture, "r");
if (fp_SubAperture == NULL)
{
perror("Couldn't open the file " File_SubAperture);
exit(1);
}
for (i = 0; fscanf(fp_SubAperture, "%lf%lf%lf%lf", &data1, &data2, &data3, &data4) != EOF; ++i)
{
SubAperture[i][0] = (int)data1;
SubAperture[i][1] = (int)data2;
SubAperture[i][2] = (int)data3;
SubAperture[i][3] = (int)data4;
}
fclose(fp_SubAperture);
//read image
float sumval = 0.0;
MatIterator_<uchar> it, end;
src = imread("WFS_29x29-circle.png", CV_LOAD_IMAGE_COLOR);
cvtColor(src, src_gray, CV_BGR2GRAY);
//imshow("gray image", src_gray);
//calculate the ROI area in advance
for (i = 0; i < 749; i++)
{
rect_subaperture[i].x = SubAperture[i][0];
rect_subaperture[i].y = SubAperture[i][1];
//rect_subaperture[i].width = SubAperture[i][2] - SubAperture[i][0];
//rect_subaperture[i].height = SubAperture[i][3] - SubAperture[i][1];
rect_subaperture[i].width = 4;
rect_subaperture[i].height = 4;
}
startTime = clock();// time start
omp_set_num_threads(2);
#pragma omp parallel private(ROI,it,i,k,f ) firstprivate(sumval,x,y) shared(src_gray,subApX,subApY,rect_subaperture)
#pragma omp for nowait schedule(guided) collapse(2)
for(i=0; i<749;i++)
{
ROI = src_gray(rect_subaperture[i]);
for (it = ROI.begin<uchar>(), end = ROI.end<uchar>(); it != end; it++)
{
((*it) > thresh) ? sumval += (*it) : NULL;
// printf("sum = %f\n", sumval);
}
for (int k = 0; k < ROI.cols; k++)
{
for (int f = 0; f < ROI.rows; f++)
{
float S = ROI.at<uchar>(f, k);
if (S < thresh)
S = 0;
x += (k * S) / sumval;
y += (f * S) / sumval;
}
}
subApX[i]= x + SubAperture[i][0];
subApY[i]= y + SubAperture[i][1];
fprintf(px, "\n%f", subApX[i]);
fprintf(py, "\n%f", subApY[i]);
}
endTime = clock();
printf("time = %f\n", (double)(endTime - startTime) / CLOCKS_PER_SEC);
return 0;
}
As you see, I must use multi for loops to finish the calculation. However, the whole code will stuck without any errors, only 50% of calculation is finished, the rest of them can't be calculated.
Anyone knows what's the problems I met? and how to speed up my code. The goal of this code is to calculate the center of mass in different ROI ares of one image. The coordinates will be saved as .TXT files and code will calculate the time needed.

Runaway Energies for MD Simulation Using Verlet Algorithm with Periodic Boundary Conditions

I am trying to write a very basic MD simulation code in C++ for a system of 125 particles in a 30x30x30 box with periodic boundary conditions. I have succeeded in initializing the box, but whenever I run the code over multiple time steps the energies blow up and the positions don't fall within the box for certain particles. I am fairly certain it is a problem with the periodic boundary conditions, but I cannot figure out what the problem is. I have attached my code below and would appreciate any advice. Thanks!
/******************************************************************************************************************************************
* Program:
* Create MD simulation for small system of particles
******************************************************************************************************************************************/
//include
#include <fstream>
#include <vector>
#include <math.h>
#include <stdlib.h>
#include <cstring>
#include <iostream>
#include <stdio.h>
#include <unistd.h>
int N; //number of particles
int l; //length of box
double T; //temp
double dt; //time step
double m; //mass
int nsteps; //number of time steps
//properties of lj potential
double sigma;
double rcut;
double ecut;
double ep;
double K; //kinetic energy of system
double Kp; //I have a time delay in the KE calculation so I need to store the value
double energy; //energy
//vector with position data
std::vector<std::vector<double> > r;
//vector with previous position data
std::vector<std::vector<double> > rp;
//vector with velocity data
std::vector<std::vector<double> > v;
//vector with force data
std::vector<std::vector<double> > f;
//vector with acceleration data
std::vector<std::vector<double> > a;
//vector to store position difference between two steps
std::vector<std::vector<double> > dr2t;
void init_pos(int l, int N);
void init_vel(int N, double T, double dt);
void force(int N, double dt);
void accel(double m);
void integrate(double dt);
void vel(double dt);
int main () {
// simulation parameters
N = 125;
l = 30;
T = 0.3;
dt = 0.001;
m = 1;
nsteps = 501;
//properties of lj potential
sigma = 1;
rcut = 3.5*sigma;
ep = 1;
ecut = 4*ep*(pow((sigma/rcut),12.0) - pow((sigma/rcut),6.0));
//initialize the positions and velocities
init_pos(l,N);
init_vel(N,T,dt);
//open file to store positions
FILE * out_pos;
out_pos = fopen("Shea_positions.dat", "w");
//open file to store energy info
FILE * out_en;
out_en = fopen("Shea_energy.dat", "w");
//header in pos file for time step 0
fprintf(out_pos, "%d \n", N);
fprintf(out_pos, "Time Step: 0 \n");
//print positions of each particle
for(int i=0; i<N; i++) {
fprintf(out_pos, "%d %f %f %f\n", i, r[i][0], r[i][1], r[i][2]);
}
//time loop to perform multiple steps
for (int t=1; t<nsteps; t++) {
//header in file for each time step
//fprintf(out_pos, "\n");
fprintf(out_pos, "%d \n", N);
fprintf(out_pos, "Time Step: \t%d \n", t);
Kp = K; //kinetic energy of system at current time step (calculated in next time step after initialization)
//perform calculations at each time step
force(N, dt);
accel(m);
integrate(dt);
vel(dt);
//print positions of each particle
for(int i=0; i<N; i++) {
fprintf(out_pos, "%d %f %f %f\n", i, r[i][0], r[i][1], r[i][2]);
}
//output to energy file
fprintf(out_en, "%d %f %f %f\n", t-1, energy, Kp, energy+Kp);
}
fclose(out_pos);
fclose(out_en);
}
/*********************************************************************************************************************************/
//function to initialize positions
void init_pos(int l, int N) {
int n = (int) cbrt(N); //number of particles in row
int a = 1; //lattice spacing
r.resize(N, std::vector<double>(3, 0));
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
for (int k=0; k<n; k++) {
int num_part = i*n*n+j*n+k;
r[num_part][0] = i*a-n/2*a;
r[num_part][1] = j*a-n/2*a;
r[num_part][2] = k*a-n/2*a;
}
}
}
}
/*********************************************************************************************************************************/
//function to initialize velocities
void init_vel(int N, double T, double dt) {
int RANDMAX=10000;
rp.resize(N, std::vector<double>(3, 0));
v.resize(N, std::vector<double>(3, 0));
std::vector<double> vcom(3, 0);
double sumv2 = 0;
K=0; //initialize kinetic energy at 0
for (int i=0; i<N; i++) {
for (int alpha=0; alpha<3; alpha++) {
v[i][alpha] = ((double)(rand() % RANDMAX) / RANDMAX)-0.5; //assign random velocities from -0.5 to 0.5
vcom[alpha] += v[i][alpha]; //calculate the com velocity
sumv2 += v[i][alpha]*v[i][alpha];
}
}
for (int alpha=0; alpha<3; alpha++) {
vcom[alpha] /= N; //translational velocity of system to remove
}
sumv2 = sumv2/N; //average KE per particle
double sf = sqrt(3*T/(sumv2)); //scaling factor
for (int i=0; i<N; i++) {
for (int alpha=0; alpha<3; alpha++) {
v[i][alpha] -= vcom[alpha]; //correct for com velocity
v[i][alpha] *= sf; //multiply by scaling factor to give the correct temp
rp[i][alpha] = r[i][alpha] - v[i][alpha]*dt; //calculate the previous position
}
K += 0.5*m*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]); //find total kinetic energy of the system
}
}
/*********************************************************************************************************************************/
//calculate force on each particle from the lj potential
void force(int N, double dt) {
f.resize(N, std::vector<double>(3, 0));
energy = 0.;
std::vector<double> dr(3, 0);
double dr2;
for (int i=0; i<N; i++) {
for (int j=0; j<i; j++) {
//calculate distance vector between two particles
dr[0] = r[i][0]-r[j][0];
dr[1] = r[i][1]-r[j][1];
dr[2] = r[i][2]-r[j][2];
//make sure pbc works by taking minimum dist between particles i and j
dr[0] -= l*round(dr[0]/l);
dr[1] -= l*round(dr[1]/l);
dr[2] -= l*round(dr[2]/l);
dr2 = dr[0]*dr[0] + dr[1]*dr[1] + dr[2]*dr[2]; //distance between particles i and j squared
if(dr2<rcut*rcut) {
double invr2 = 1./dr2;
double invr6 = invr2*invr2*invr2;
double ff = 48.* invr2*invr6*(invr6-0.5); //modulus of the force on a particle
//add force to particle i
f[i][0] += ff*dr[0];
f[i][1] += ff*dr[1];
f[i][2] += ff*dr[2];
//add equal and oppostie force to particle j
f[j][0] -= ff*dr[0];
f[j][1] -= ff*dr[1];
f[j][2] -= ff*dr[2];
//calculate the potential energy of the particle
energy += 4*invr6*(invr6-1)-ecut;
}
}
}
}
/*********************************************************************************************************************************/
//calculate the acceleration of the particle from the force
void accel(double m) {
a.resize(N, std::vector<double>(3, 0));
for (int i=0; i<N; i++) {
for (int alpha=0; alpha<3; alpha++) {
a[i][alpha] = f[i][alpha]/m;
}
}
}
/*********************************************************************************************************************************/
//integrate to get the new position of the particle after one time step
void integrate(double dt) {
dr2t.resize(N, std::vector<double>(3, 0));
for (int i=0; i<N; i++) {
for (int alpha=0; alpha<3; alpha++) {
double rr = 2*r[i][alpha] - rp[i][alpha] + dt*dt*a[i][alpha]; //new pos from velocity verlet alg
printf("%f %f %f \n", rr,r[i][alpha],rp[i][alpha]);
//printf("%f \n", a[i][alpha]);
if (rr >= l/2) {
rr -= l;
}
if (rr < -l/2) {
rr += l;
}
dr2t[i][alpha] = rr - rp[i][alpha]; //difference between new position and previous position so you can find velocity
dr2t[i][alpha] -= l*round(dr2t[i][alpha]/l);
rp[i][alpha] = r[i][alpha]; //current position becomes previous position
r[i][alpha] = rr; //new position becomes current positions
}
}
}
/*********************************************************************************************************************************/
//velocity
void vel(double dt) {
K = 0;
for (int i=0; i<N; i++) {
for (int alpha=0; alpha<3; alpha++) {
//printf("%f %f %f \n", dr2t[i][0], dr2t[i][1], dr2t[i][2]);
v[i][alpha] = dr2t[i][alpha]/(2*dt);
}
K += 0.5*m*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]);
}
}
/*********************************************************************************************************************************/

Cuda Memcpy from Device to Host crashes

I m trying to find a minimum of RGB around the patch size of 15 x 15
In source.cpp file at
SAFE_CALL(cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost));
program get crashed
Here is my code snippet
darkprior.h
#ifndef DARKPRIOR_H_INCLUDED
#define DARKPRIOR_H_INCLUDED
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "opencv2/opencv.hpp"
#define SAFE_CALL(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
void dark_channel(float *image_d, float *rgbmin_d, int height, int width);
#endif
Source.cpp
#include "DarkPrior.h"
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
int main()
{
//load the image
Mat src = imread("foggy_river.jpg");
//check whether image loaded is empty or not.
if (src.empty())
{
cerr << "no image"; return -1;
}
//Mat rgbMin(src.size(), CV_MAKETYPE(src.depth(), 1));
// int step = src.step;
float *image_h = NULL;
float *image_d = NULL;
float *Dark_d = NULL;
float *Dark_h = NULL;
//Mat rgbmin(src.size(), CV_MAKETYPE(src.depth(), 1));
size_t size1 = src.step * src.rows * sizeof(float);
size_t size2 = src.cols * src.rows * sizeof(float);
image_h = (float *)malloc(size1);
Dark_h = (float *)malloc(size1);
SAFE_CALL(cudaMalloc((void**)&image_d, size1));
SAFE_CALL(cudaMalloc((void**)&Dark_d, size2));
//convert image from CV::MAT to float*.
Mat dst;
src.convertTo(dst, CV_32F);
image_h = dst.ptr<float>();
SAFE_CALL(cudaMemcpy(image_d, image_h, size1, cudaMemcpyHostToDevice));
cout << "Calculating Minimum of RGB ..." << endl;
dark_channel(image_d, Dark_d, src.rows, src.cols);
SAFE_CALL(cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost));
Mat Dark_out(src.rows, src.cols, CV_32FC1, Dark_h);
imwrite("MinRGB.jpg", Dark_out);
cudaFree(image_d);
cudaFree(Dark_d);
//free(image_h);
//free(rgbmin_h);
return 0;
}
minRGB.cu
#include "DarkPrior.h"
//#define min(x,y) ((x<y)?x:y)
__device__ float safe_get(float *rgbMin, int width, int height, int x, int y)
{
// Clamp indices to image boundaries
x = min( max(0, x), width - 1);
y = min( max(0, y), height - 1);
// Translate 2D index into 1D index
const int idx = y * width + x ;
return rgbMin[idx];
}
__device__ float estimate_minimum_patch(float *rgbMin, int width, int height, int radius, int x, int y, float Minval)
{
for(int i = -radius; i <= radius; i++)
{
for(int j = -radius; j <= radius; j++)
{
float val = safe_get(rgbMin, width, height, x+i, y+j);
Minval = min (val, Minval);
}
}
}
__global__ void kernel_darkChannel (float *rgbMin, float *darkCh, int height, int width)
{
int radius = 7;
int x = blockIdx.x; // Current column
int y = blockIdx.y; // Current row
int tid = y * width + x;
float Minval = 255.0;
estimate_minimum_patch(rgbMin, width, height, radius, x, y, Minval);
darkCh[tid] = Minval;
}
__global__ void kernel_findMinRGB (float3 *image, float *tmp_min, int height, int width)
{
int x = blockIdx.x; // Current column
int y = blockIdx.y; // Current row
int i = y * width + x;
if(x > height && y > width)
{
return;
}
tmp_min[i] = min(image[i].x, min(image[i].y, image[i].z));
}
void dark_channel(float *image_d, float *Dark_d, int height, int width)
{
dim3 grid(width, height);
float *tmp_min;
cudaMalloc((void **)(&tmp_min), sizeof(float)*height*width);
kernel_findMinRGB <<<grid, 1>>> ((float3 *)image_d, tmp_min, height, width);
printf("RGB min is found\n");
kernel_darkChannel <<<grid, 1>>> (tmp_min, Dark_d, height, width);
printf("patch of minimum is also found\n");
return;
}
My code getting crashed with an error of unknown error # line 45 of source.cpp
I'm totally out of thoughts what is the reason, maybe you'll be able to help.

Pointer Dark_h points to host memory segment of size1 bytes. Pointer Dark_d points to device memory segment of size2 bytes. If size1 < size2 the call:
cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost)
will be troublesome as you'll write illegal memory (memory that's not part of array segment to which Dark_h points to, and perhaps you'll get SEGFAULT). I haven't tried it but I bet this is the reason behind the crash.

How to call existing host function from device function in cuda [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Closed 8 years ago.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
We don’t allow questions seeking recommendations for books, tools, software libraries, and more. You can edit the question so it can be answered with facts and citations.
Improve this question
I have seen a similar question here
However,I could not get an exact answer here, and it is written in 2012.
I am trying to call cublasStatus_t cublasSgbmv(...) function, which is defined in "cublas_v2.h", in a __global__ function. However, I could not use the dynamic parallelism feature. I only have 1 source.cu file. However, I have read that I should compile it in a dynamic way so that it separates device and host functions, then I can link these outputs.
Is there anyone who knows how to do it, or a good source to explain it?
Thanks in advance
edit : if undervoted, please explain the reason at least for me to learn my mistake?
edit2 :
my specific problem is, I'm using the following code in my Source.cu :
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cstdio>
#include <stdio.h>
#include <device_launch_parameters.h>
#include <stdlib.h> //srand(), rand()
#include <time.h>
#include <builtin_types.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define IDX2C(i ,j , ld ) ((( j )*( ld ))+( i ))
#define HEIGHT 4
#define WIDTH 4
#define V 4
#define KL 2
#define KU 1
#define THREADS_PER_BLOCK 512
#pragma comment(lib, "cublas")
//#pragma comment(lib, "helper_cuda")
using namespace std;
void create_Matrix(int* matrix, int width, int height){
int i, len;
len = height * width;
srand(time(NULL));
for (i = 0; i < len; i++){
matrix[i] = rand() % 10 + 1; //generates number between 1-10
}
}
template <typename T>
void print_vector(T* vector, int len){
for (int i = 0; i < len; i++)
cout << vector[i] << " ";
cout << endl;
}
template <typename T>
void creating_bandedMatrix(T* bandedMatrix, int height, int width, int ku, int kl){
//fill matrix with zeros at the beginning
int i, len;
len = height * width;
for (i = 0; i < len; i++){
bandedMatrix[i] = 0; //generates number between 1-10
}
srand(time(NULL));
//filling banded diagonal
int start, end;
for (int i = 0; i < height; i++){
start = i - kl;
if (start < 0)
start = 0;
end = i + ku + 1;
if (end > width)
end = width;
for (int j = start; j < end; j++){
*(bandedMatrix + (i*width) + j) = (float)(rand() % (10) + 1); //rand() / (T)RAND_MAX;;
}
}
}
template <typename T>
void print_matrix(T* matrix, int width, int height){
int len = width*height;
cout << "asdsffffff" << endl;
for (int i = 0; i < len; i++){
if (!(i%width))
cout << endl;
cout << i << ":" <<matrix[i] << " ";
}
cout << endl;
}
template <typename T>
void computeMatrixVectorMultiplication(T* bandedMatrix, T* vector2){
T row_sum = 0;
T* bandedHostResult = (T*)malloc(WIDTH * sizeof(T));
for (int i = 0; i < HEIGHT; i++){
row_sum = 0;
for (int j = 0; j < WIDTH; j++){
row_sum += (*(bandedMatrix + i*WIDTH + j)) * vector2[j];
}
bandedHostResult[i] = row_sum;
}
//priting the result
cout << "\n\nBanded Host Result...\n";
print_vector(bandedHostResult, WIDTH);
}
template <typename T>
void fillLapackMatrix(T* lapack_matrix, T* bandedMatrix, int kl, int ku, int banded_w, int banded_h, int lapack_w, int lapack_h){
int i, j, lapack_i;
int len = lapack_h * lapack_w;
for (i = 0; i < len; i++){
lapack_matrix[i] = 0; //generates number between 1-10
}
for (i = 0; i < banded_w; i++){
for (j = 0; j < banded_h; j++){
lapack_i = ku + i - j;
*(lapack_matrix + lapack_i*lapack_w + j) = *(bandedMatrix + i*banded_w + j);
//lapack_matrix[lapack_i*lapack_w + j] = bandedMatrix[i*bandedMatrix + j];
}
}
}
__global__ void device_cublasSgbmv(int m,int n,int kl, int ku,float* alpha, float* A, int lda ,float* B,int ldb,float*R, int ldr, float* beta){
int index = blockIdx.x * blockDim.x + threadIdx.x;
cublasHandle_t handle;
cublasCreate(&handle);
cublasOperation_t trans = CUBLAS_OP_N;
float* dev_x;
cudaMalloc((void**)&dev_x,sizeof(float) * n);
if(index < ldr){
cublasSgbmv(handle, trans,m, n, kl, ku, alpha, A, m, B+index*n, 1, beta, R+index*n, 1);
index = 0;
}
}
void fillNormalMatrix(float* B,int h,int w){
for(int i = 0; i < h;i++){
for(int j = 0; j < w;j++){
B[i*w + j] = 1;
}
}
}
int main()
{
cublasStatus_t status;
float *A;
float *x, *y;
float *dev_x, *dev_y;
int incx, incy;
float *dev_A = 0;
float alpha = 1.0f;
float beta = 0.0f;
int matrixSize = WIDTH * HEIGHT;
int i, j;
cublasHandle_t handle;
/* Initialize CUBLAS */
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
//Allocate host memory for the matrices
A = (float *)malloc(matrixSize* sizeof(float));
//Allocate memory for host vectors
x = (float *)malloc(WIDTH * sizeof(float));
y = (float*)malloc(WIDTH * sizeof(float));
// Fill the matrices with test data
creating_bandedMatrix(A, WIDTH, HEIGHT, KU, KL);
cout << "Banded Matrix\n";
print_matrix(A, WIDTH, HEIGHT);
//Fill the vectors with random data
for (i = 0; i < WIDTH; i++){
x[i] = 1;// (float)(rand() % (10) + 1);:
y[i] = (float)(rand() % (10) + 1);
}
cout << "\nvector x...\n";
print_vector(x, WIDTH);
//cout << "\nvector y...\n";
//print_vector(y, WIDTH);
//Allocate device memory for the matrix
if (cudaMalloc((void **)&dev_A, matrixSize * sizeof(float)) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
//Allocate device memory for vectors
if (cudaMalloc((void**)&dev_x, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&dev_y, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
// Initialize the device vectors with the host vectors
status = cublasSetVector(WIDTH, sizeof(float), x, 1, dev_x, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write x vector)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(WIDTH, sizeof(float), y, 1, dev_y, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write y vector)\n");
return EXIT_FAILURE;
}
//initialize matrix with lapack format
int lapack_width = WIDTH > HEIGHT ? HEIGHT : WIDTH;
int lapack_height = KL + KU + 1;
int lapackSize = lapack_height * lapack_width;
float* lapack_matrix = (float*)malloc(lapackSize * sizeof(float));
fillLapackMatrix(lapack_matrix, A, KL, KU, WIDTH, HEIGHT, lapack_width, lapack_height);
cout << "\n\nLAPACK MAtrix\n";
print_matrix(lapack_matrix, lapack_width, lapack_height);
//convert to column column matrix
float* col = (float*)malloc(lapackSize * sizeof(float));
for (i = 0; i < WIDTH; i++){
for (j = 0; j < HEIGHT; j++){
col[i + WIDTH*j] = lapack_matrix[WIDTH*i + j];
}
}
cout << "Lapack Column Based Matrix\n";
print_matrix(col,HEIGHT-1,WIDTH);
//status = cublasSetVector(lapackSize, sizeof(float), A, 1, dev_A, 1);
cublasSetMatrix(HEIGHT, WIDTH, sizeof(float), col, HEIGHT, dev_A, HEIGHT);
cublasOperation_t trans = CUBLAS_OP_N;
incy = incx = 1;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////// Banded Matrix Matrix Multipllicatio ///////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
float* B,*dev_B,*dev_R,*R;
B = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
R = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
fillNormalMatrix(B,WIDTH,HEIGHT);
cudaMalloc((void**)&dev_B,matrixSize*sizeof(*B));
cudaMalloc((void**)&dev_R,matrixSize*sizeof(*R));
cublasSetMatrix(HEIGHT, WIDTH, sizeof(*B), B, HEIGHT, dev_B, HEIGHT);
cout << "Matrix B\n";
print_matrix(B,HEIGHT,WIDTH);
cout << "gfsdf\n";
device_cublasSgbmv<<<1,4>>>(HEIGHT, WIDTH, KL, KU, &alpha, dev_A, WIDTH, dev_B, HEIGHT, dev_R, HEIGHT,&beta);
cout << "after\n";
cublasGetMatrix(HEIGHT,WIDTH, sizeof (*R) ,dev_R ,WIDTH,R,WIDTH);
getchar();
return 0;
}
and compile it like :
nvcc -gencode=arch=compute_35,code=sm_35 -lcublas -lcudadevrt -O3 Source.cu -o Source.o -dc
g++ Source.o -lcublas -lcudart
then, I get the following :
In function `__sti____cudaRegisterAll_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a()':
tmpxft_00001f1e_00000000-3_lapack_vector.cudafe1.cpp:(.text.startup+0x575): undefined reference to `__cudaRegisterLinkedBinary_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a'
collect2: error: ld returned 1 exit status

You can compile and link the code you have now shown with a single command like this:
nvcc -arch=sm_35 -rdc=true -lcublas -lcublas_device -lcudadevrt -o test Source.cu
You may get some warnings like this:
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_ssyrk.o'
Those can be safely ignored.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

visual studio 2012 parallelism with openmp - c++

Related

FFTW returns different results in some loops

How to speed up my code by openmp with multi for loops

Runaway Energies for MD Simulation Using Verlet Algorithm with Periodic Boundary Conditions

Cuda Memcpy from Device to Host crashes

How to call existing host function from device function in cuda [closed]

Categories

Resources