C/C++: circular shift on matrices - c++

I am trying to write an efficient code to perform circular shift which I need to implement it on multiple times on big matrices during my data processing.
On my first trial, compiler throws some exception and it seems that I may be trying to access matrix element outside its size and I have no idea what is going on wrong.
1) I am also using Armadillo lib which has "mat" definition.
2) I intend to shift it by row and/ or column.
Here is my try:
#include "stdafx.h"
#include <vector>
#include <iostream>
#include "C:\Users\kumar\Documents\Visual Studio 2012\UserLibs\armadillo-3-910-0\include\armadillo"
#include <stdlib.h> /* srand, rand */
using namespace arma;
// Circularly shifts a row-major xdim-by-ydim array of elements.
//
// out    : destination element buffer, at least xdim*ydim elements; must not
//          overlap `in`.
// in     : source element buffer, xdim*ydim elements.
// xdim   : number of rows; ydim: number of columns.
// xshift : row shift, yshift: column shift. Either may be negative or larger
//          than the corresponding dimension.
//
// `ty` is the ELEMENT type: pass pointers to the element storage, not the
// address of a matrix object.
template<class ty>
void circshift(ty *out, const ty *in, int xdim, int ydim, int xshift, int yshift)
{
    // Nothing to move, and `% 0` below would be undefined.
    if (xdim <= 0 || ydim <= 0)
        return;
    // Bring both shifts into [0, dim). C++ `%` keeps the sign of the dividend,
    // so a negative shift would otherwise yield a negative index.
    const int dx = ((xshift % xdim) + xdim) % xdim;
    const int dy = ((yshift % ydim) + ydim) % ydim;
    for (int i = 0; i < xdim; i++)
    {
        const int ii = (i + dx) % xdim;
        for (int j = 0; j < ydim; j++)
        {
            const int jj = (j + dy) % ydim;
            const int iOutputInd = ii * ydim + jj;
            const int iInputInd = i * ydim + j;
            // Debug trace of the index mapping.
            std::cout << " iOutputInd --> " << iOutputInd << " ; iInputInd -->" << iInputInd << "\n";
            out[iOutputInd] = in[iInputInd]; // EXCEPTION BEING THROWN HERE
        }
    }
}
// Test driver: builds a 3x3 Armadillo matrix, copies it into a_out and asks
// circshift() to shift it by one row.
int _tmain(int argc, _TCHAR* argv[])
{
//a = [1 2 3; 4 5 6; 7 8 9];
mat a, a_out; // "mat" defined in C++ lib Armadillo
a << 1 << 2 << 3 << endr
<< 4 << 5 << 6 << endr
<< 7 << 8 << 9 <<endr;
a.reshape(3,3);
//a.print();
// Pre-size the destination by copying the source.
a_out = a;
int xdim = 3; int ydim = 3; int xshift = 1; int yshift = 0;
// NOTE: &a_out and &a are pointers to whole mat objects, so the template
// parameter is deduced as mat and circshift() indexes them as arrays of mat
// rather than as element buffers. This is the call that crashes.
circshift(&a_out, &a, xdim, ydim, xshift, yshift);
a_out.print();
return 0;
}
It compiles fine. However, when I try to run, Visual studio throws following error:
Unhandled exception at 0x3FF00000 in Circshift_Example.exe: 0xC0000005: Access violation (parameters: 0x00000008).
I get another error in visual studio console, which complains:
error: Mat::init(): requested size is too large
Update: FINAL SOLUTION
I am posting my code as it may be useful for some users.
Please note that I am using the "Armadillo" library to create the matrix. One can replace the Armadillo "mat" class with their own matrix class.
Please up-vote if you use this code.
#include "stdafx.h"
#include "armadillo-3-910-0\include\armadillo"
using namespace arma;
// Circularly shifts a column-major matrix by xshift rows and yshift columns.
//
// ty must expose `n_rows`, `n_cols` and linear element access through
// operator[] (element (i, j) at index j * n_rows + i).
//
// out    : destination; must already hold n_rows * n_cols elements and must
//          be a different object from `in` (elements are written while `in`
//          is still being read).
// in     : source matrix.
// xshift : row shift, yshift: column shift; both may be negative or exceed
//          the matrix dimensions.
template<class ty>
void circshift(ty& out, const ty& in, int xshift, int yshift)
{
    const int xdim = static_cast<int>(in.n_rows);
    const int ydim = static_cast<int>(in.n_cols);
    // An empty matrix has nothing to shift, and `% 0` would be undefined.
    if (xdim <= 0 || ydim <= 0)
        return;
    // Normalise each shift into [0, dim) once, instead of correcting a
    // negative remainder for every element.
    const int dx = ((xshift % xdim) + xdim) % xdim;
    const int dy = ((yshift % ydim) + ydim) % ydim;
    for (int j = 0; j < ydim; j++)
    {
        const int jj = (j + dy) % ydim;
        for (int i = 0; i < xdim; i++)
        {
            const int ii = (i + dx) % xdim;
            out[jj * xdim + ii] = in[j * xdim + i];
        }
    }
}
int _tmain(int argc, _TCHAR* argv[])
{
//a = [1 2 3; 4 5 6; 7 8 9];
mat a, a_out;
a << 1 << 2 << 3 << endr
<< 4 << 5 << 6 << endr
<< 7 << 8 << 9 <<endr;
a.reshape(3,3);
a_out = a;
int xshift = 1; int yshift = 0;
circshift(a_out, a, xshift, yshift);
a_out.print();
xshift = 1; yshift = -1;
circshift(a_out, a, xshift, yshift);
a_out.print();
return 0;
}

The main error here is that you pass pointers to mat type objects to the circshift() function (the out and in arguments), but then use these arguments as arrays of mat. The following line is not interpreted as you think
out[iOutputInd] = in[iInputInd];
because out and in are not mat objects. They are pointers to mat objects, so the compiler will interpret in and out as being pointers to arrays of mat and index these arrays, copying a nonexistent mat from in[...] to another nonexistent location.
One simple way to fix that is to use references instead of pointers to pass the mat objects, i.e.:
template<class ty> void circshift(ty& out, const ty& in, int xdim, int ydim, int xshift, int yshift)
{
...
}
and call it in _tmain using:
circshift(a_out, a, xdim, ydim, xshift, yshift);

Related

FFTW returns different results in some loops

I'm new to use C++.
I have tried to implement FFT using Eigen and fftw3 (version 3.3.10).
The purpose is to read measurement data from CSV file and analyze FFT.
However, I realized that fftw sometimes returns different results (sometimes the results are correct, sometimes they are wrong) when the same analysis is repeated in a loop.
The problem might be just a memory leaking or casting variable problem as I'm new to use C++.
I'm very appreciated if you give me any advice or tips.
Thanks in advance!
#define EIGEN_FFTW_DEFAULT
#include <iostream>
#include <string>
#include <vector>
#include <cmath>
#include <fstream>
#include <sstream>
#include "Eigen/Dense"
#include <fftw3.h>
using namespace Eigen;
// define functions
template <typename T>
T readCSV(const std::string &path);
int nextpow2(int n);
VectorXd offsetData(VectorXd v);
void fftw_test(VectorXd x);
// Returns the exponent e of the smallest power of two with 2^e >= n.
//
// n <= 0 returns 0. n == 1 returns 1; this special case is kept unchanged
// from the earlier implementation so existing callers see the same FFT size.
// The result is computed with integer arithmetic only, so it is exact for
// every int and never passes a non-positive value to a logarithm.
int nextpow2(int n)
{
    if (n <= 0)
        return 0;
    if (n == 1)
        return 1;
    // Count the bits needed to represent n - 1; that equals
    // floor(log2(n - 1)) + 1.
    int exponent = 0;
    for (unsigned int v = static_cast<unsigned int>(n) - 1u; v != 0u; v >>= 1)
        ++exponent;
    return exponent;
}
// Read Measurement Data
template <typename T>
T readCSV(const std::string &path)
{
// https://stackoverflow.com/questions/34247057/how-to-read-csv-file-and-assign-to-eigen-matrix
std::ifstream file;
std::string line;
std::string cell;
std::vector<double> row;
uint rows = 0;
file.open(path);
std::cout << "Opend file: " << path << std::endl;
std::getline(file, line); // skip the first header line
while (std::getline(file, line))
{
std::stringstream lineStream(line);
while (std::getline(lineStream, cell, ','))
{
row.push_back(std::stod(cell)); // insert value as double
}
++rows;
}
return Map<const Matrix<typename T::Scalar, T ::RowsAtCompileTime, T::ColsAtCompileTime, RowMajor> >(row.data(), rows, row.size() / rows);
};
// Scales the samples, zero-pads them to a power-of-two length and prints the
// first input values and frequency bins of a forward complex 1-D FFT.
//
// x : samples; taken by value because it is rescaled locally.
//
// The FFT input is filled directly: real part = sample for i < ns, exactly
// 0.0 for the padding, imaginary part = 0. An Eigen dynamic vector created
// with only a size (`VectorXd(n)`) has uninitialised coefficients, so building
// the padding that way leaves indeterminate values in the tail and makes the
// transform differ from call to call.
void fftw_test(VectorXd x)
{
    // Convert data unit
    x = x * 980.665 * 10; // Unit conversion:[G] to [cm/sec^2] to [mm/sec^2]
    const int ns = static_cast<int>(x.size()); // number of samples
    const int N = 1 << nextpow2(ns);           // FFT length (power of two)

    fftw_complex *in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
    fftw_complex *out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
    if (in == nullptr || out == nullptr)
    {
        std::cerr << "fftw_malloc failed" << std::endl;
        if (in != nullptr)
            fftw_free(in);
        if (out != nullptr)
            fftw_free(out);
        return;
    }

    // Samples first, then explicit zero padding up to N.
    for (int i = 0; i < N; i++)
    {
        in[i][0] = (i < ns) ? x(i) : 0.0;
        in[i][1] = 0;
    }

    // FFTW_ESTIMATE plans without touching the already-filled arrays.
    fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT);
    if (p != nullptr)
    {
        fftw_execute(p);
        // Never print past the end of the buffers when N < 10.
        const int nprint = (N < 10) ? N : 10;
        for (int i = 0; i < nprint; i++)
        {
            printf("in: %3d %+9.5f %+9.5f I\n", i, in[i][0], in[i][1]);
        }
        for (int i = 0; i < nprint; i++)
        {
            printf("freq: %3d %+9.5f %+9.5f I\n", i, out[i][0], out[i][1]);
        }
        fftw_destroy_plan(p);
    }
    else
    {
        std::cerr << "fftw_plan_dft_1d failed" << std::endl;
    }

    fftw_free(in);
    fftw_free(out);
    fftw_cleanup();
}
// Returns a copy of v with its mean value subtracted from every element.
VectorXd offsetData(VectorXd v)
{
    // Take the mean once, then remove it coefficient-wise in place.
    const double meanValue = v.mean();
    v.array() -= meanValue;
    return v;
}
int main()
{
// Read measured data from csv file
MatrixXd measuredData = readCSV<MatrixXd>("./sampleCsv/20220208-134655_A351AU.csv");
// Extract a vertical acceleration column
VectorXd Acc = measuredData.col(4);
VectorXd Acc_offset = offsetData(Acc / 1000);
for (int i = 0; i < 100; ++i)
{
// fftw bug test
printf("loop: %ith \n", i);
fftw_test(Acc_offset);
};
return 0;
}
The sample CSV file is here.
https://drive.google.com/file/d/1DQO2eeMX7AfxjnuW8DDJMOitxHNuIDHA/view?usp=sharing
The correct results should be below.
freq: 0 -0.00000 +0.00000 I
freq: 1 +320.64441 -83.56961 I
freq: 2 -113.66004 -195.80680 I
freq: 3 -28.57778 -13.57046 I
freq: 4 -47.71908 +185.43538 I
freq: 5 +381.01770 +92.18739 I
freq: 6 +430.73267 -348.16464 I
freq: 7 -111.55714 -796.10333 I
freq: 8 -810.79331 -273.42916 I
freq: 9 -624.83461 +607.38775 I

OpenCV Assertion failed ((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels())) in cv::Mat::at

I'm trying to do the smoothing of an RGB image using OpenCV. I'm using spatial correlation formula:
Here's the code:
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
using namespace cv;
using namespace std;
// Loads the image named on the command line, builds a zero-padded copy and
// attempts a 3x3 mean filter per colour channel, then shows both images.
int main(int argc, char** argv) {
if (argc < 2) {
cout << "usage: " << argv[0] << " image_name" << endl;
exit(0);
}
String imageName = argv[1];
Mat image;
image = imread(imageName, IMREAD_COLOR);
if (image.empty()) {
cout << "Could not open or find the image" << std::endl;
return -1;
}
// copy is the image surrounded by a border of padding2 zero pixels per side.
int padding = 2;
int padding2 = padding / 2;
Mat copy = Mat::zeros(image.rows + padding, image.cols + padding, image.type());
image.copyTo(copy(Rect(padding2, padding2, image.cols, image.rows)));
//image.copyTo(copy(Range(padding2,image.rows+padding2),Range(padding2,image.cols+padding2)));
cout << "rows: " << image.rows << " cols: " << image.cols << endl;
// NOTE: i runs up to image.rows and j up to image.cols, and the reads below
// use (i + 1 + r, j + 1 + c) on `image` itself, so the indices go past the
// last row/column of `image`; this is what trips the cv::Mat::at assertion.
for (int i = 0; i < image.rows + 1; i++) {
for (int j = 0; j < image.cols + 1; j++) {
// Per-channel sums over the 3x3 neighbourhood (channel order B, G, R).
int sumB = 0;
int sumG = 0;
int sumR = 0;
for (int r = -1; r <= 1; r++) {
for (int c = -1; c <= 1; c++) {
sumB += image.at<Vec3b>(i + 1 + r, j + 1 + c)[0];
sumG += image.at<Vec3b>(i + 1 + r, j + 1 + c)[1];
sumR += image.at<Vec3b>(i + 1 + r, j + 1 + c)[2];
}
}
cout << "i: " << i << " j: " << j << endl;
// Integer mean of the nine samples.
copy.at<Vec3b>(i + 1, j + 1)[0] = sumB / 9;
copy.at<Vec3b>(i + 1, j + 1)[1] = sumG / 9;
copy.at<Vec3b>(i + 1, j + 1)[2] = sumR / 9;
}
}
imshow("Original", image);
imshow("Copy", copy);
waitKey(0);
return 0;
}
When I'm trying to execute it, I get this error:
OpenCV(3.4.9) Error: Assertion failed ((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels())) in cv::Mat::at, file C:\opencv\build\include\opencv2\core\mat.inl.hpp, line 1179
I'm using OpenCV 3.4.9 with Visual Studio 2022 in Windows 11.
The cause for the assert:
Trying to access a cv::Mat out of its bound.
In these lines:
sumB += image.at<Vec3b>(i + 1 + r, j + 1 + c)[0];
sumG += image.at<Vec3b>(i + 1 + r, j + 1 + c)[1];
sumR += image.at<Vec3b>(i + 1 + r, j + 1 + c)[2];
i can be up to image.rows, and j can be up to image.cols (see the for loops above). Then r and c can be up to 1. It means the .at method will be called with out-of-bound indices.
You could change your for loops to do 2 less iterations.
But another related issue is that in each (i,j) iteration you actually handle the ((i+1),(j+1)) pixel, which is a bit misleading.
I advise to change it to be more straightforward (see the code below).
Another issue is that calling .at is quite expensive (due to its implementation containing a lot of checks - one of them caused the assert). I reduced the number of calls by a factor of 3 by getting a reference to the pixel once, and then using this reference to access the 3 channels.
A few more comments:
Better to avoid using namespace std - see here Why is "using namespace std;" considered bad practice?
I also believe using namespace cv should be avoided for similar (even if less strong) reasons.
This code can be further optimized, by using direct access to cv::Mat data using pointers. But I think this optimization can be subject of another stackoverflow entry.
Here is the code:
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
// Loads the image named on the command line, applies a 3x3 mean filter to
// every interior pixel and shows the original next to the filtered result.
//
// `copy` is the image surrounded by a border of `padding2` zero pixels, so
// image pixel (i, j) lives at copy(i + padding2, j + padding2). The filtered
// value is written at that position; writing it at (i, j) would shift the
// smoothed content by one pixel relative to the rest of the copy.
// Pixels on the image border have no full 3x3 neighbourhood and keep their
// original value.
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cout << "usage: " << argv[0] << " image_name" << std::endl;
        return 0;
    }
    cv::String imageName = argv[1];
    cv::Mat image = cv::imread(imageName, cv::IMREAD_COLOR);
    if (image.empty()) {
        std::cout << "Could not open or find the image" << std::endl;
        return -1;
    }
    int padding = 2;
    int padding2 = padding / 2;
    cv::Mat copy = cv::Mat::zeros(image.rows + padding, image.cols + padding, image.type());
    image.copyTo(copy(cv::Rect(padding2, padding2, image.cols, image.rows)));
    std::cout << "rows: " << image.rows << " cols: " << image.cols << std::endl;
    // Interior pixels only: i +/- 1 and j +/- 1 must stay inside `image`.
    for (int i = 1; i < image.rows - 1; i++) {
        for (int j = 1; j < image.cols - 1; j++) {
            int sumB = 0;
            int sumG = 0;
            int sumR = 0;
            for (int r = -1; r <= 1; r++) {
                for (int c = -1; c <= 1; c++) {
                    // One checked access per neighbour, reused for all channels.
                    auto const & srcPixel = image.at<cv::Vec3b>(i + r, j + c);
                    sumB += srcPixel[0];
                    sumG += srcPixel[1];
                    sumR += srcPixel[2];
                }
            }
            // Account for the border offset of the padded copy.
            auto & dstPixel = copy.at<cv::Vec3b>(i + padding2, j + padding2);
            dstPixel[0] = sumB / 9;
            dstPixel[1] = sumG / 9;
            dstPixel[2] = sumR / 9;
        }
    }
    cv::imshow("Original", image);
    cv::imshow("Copy", copy);
    cv::waitKey(0);
    return 0;
}

Error Eigen library on linux

I have implemented this code with Eigen library to have Triplet structure.
This code works very well in my project on my Mac OS X. However the same code don't work on Linux platform.
Eigen::SparseMatrix<double> spdiags(const MatrixXd& B, const
Eigen::Matrix<int, 1,1>& d, size_t m, size_t n)
{
Eigen::SparseMatrix<double> A(m,n);
typedef Eigen::Triplet<double> T;
std::vector<T> triplets;
triplets.reserve(std::min(m,n)*d.size());
for (int k = 0; k < d.size(); k++)
{
int i_min = std::max(0, -d(k));
int i_max = std::min(m - 1, n - d(k) - 1);
int B_idx_start = m >= n ? d(k) : 0;
for (int i = i_min; i <= i_max; i++) {
triplets.push_back( T(i, i+k, B(B_idx_start + i, k)) );
}
A.setFromTriplets(triplets.begin(), triplets.end());
std::cout << "Row\tCol\tVal" <<std::endl;
for (int k=0; k < A.outerSize(); ++k)
{
for (SparseMatrix<double>::InnerIterator it(A,k); it; ++it)
{
std::cout << it.row() << "\t"; // row index
std::cout << it.col() << "\t";
std::cout << it.value() << std::endl;
}
}
return A;
}
I have this error only on Linux (there is no error on Mac). The code source of the file DenseCoeffsBase.h is the same:
"/usr/local/include/Eigen/src/Core/DenseCoeffsBase.h:114:
Eigen::DenseCoeffsBase<Derived, 0>::CoeffReturnType
Eigen::DenseCoeffsBase<Derived, 0>::operator()
(Eigen::DenseCoeffsBase<Derived, 0>::Index,
Eigen::DenseCoeffsBase<Derived, 0>::Index) const
[with Derived = Eigen::Matrix<double, -1, -1>;
Eigen::DenseCoeffsBase<Derived, 0>::CoeffReturnType = const double&;
Eigen::DenseCoeffsBase<Derived, 0>::Index = long int]:
Assertion `row >= 0 && row < rows() && col >= 0 && col < cols()' failed."
Any ideas?
Here is an MVC as asked :
#include<Eigen/Sparse>
#include <Eigen/Sparse>
#include<Eigen/Dense>
#include<Eigen/Eigenvalues>
Matrix<int, 1, 1> d1; d1(0)=0;
MatrixXd d0; d0.resize(1,5);
d0(0)=10;d0(1)=20;d0(2)=30;d0(3)=30;d0(4)=40;d0(5)=50;
Eigen::SparseMatrix<double> Diag_laplacian=test.spdiags(d0,d1,5,5);
//--------------
//the result must be like this :
Row Col Val
0 0 10
1 1 20
2 2 30
3 3 30
4 4 40
This, my dear sir/madam, is an MCVE
#include <iostream>
#include <Eigen/Core>
#include <Eigen/Sparse>
using namespace Eigen;
// Builds an m-by-n sparse matrix from the values in B, using the offsets in d,
// prints its non-zero entries as "Row Col Val" and returns it.
//
// B : source values, read as B(B_idx_start + i, k), i.e. one COLUMN of B per
//     entry of d. Passing a matrix with one ROW per entry makes that access
//     out of range, which Eigen reports through an assertion unless NDEBUG
//     is defined.
// d : diagonal offsets (a fixed 1x1 matrix here, so exactly one offset).
// m, n : dimensions of the result.
Eigen::SparseMatrix<double> spdiags(const MatrixXd& B,
const Eigen::Matrix<int, 1, 1>& d, size_t m, size_t n)
{
Eigen::SparseMatrix<double> A(m, n);
typedef Eigen::Triplet<double> T;
std::vector<T> triplets;
// Upper bound on the number of entries: one full diagonal per offset.
triplets.reserve(std::min(m, n)*d.size());
for (int k = 0; k < d.size(); k++)
{
// Row range covered by the diagonal with offset d(k).
int i_min = std::max(0, -d(k));
int i_max = std::min(m - 1, n - d(k) - 1);
int B_idx_start = m >= n ? d(k) : 0;
// NOTE(review): the column index is i + k (the position of the offset in d),
// not i + d(k); the two agree only for the single offset 0 used here.
for (int i = i_min; i <= i_max; i++)
triplets.push_back(T(i, i + k, B(B_idx_start + i, k)));
}
A.setFromTriplets(triplets.begin(), triplets.end());
// Dump every stored coefficient, one outer slice at a time.
std::cout << "Row\tCol\tVal" << std::endl;
for (int k = 0; k < A.outerSize(); ++k)
{
for (SparseMatrix<double>::InnerIterator it(A, k); it; ++it)
{
std::cout << it.row() << "\t"; // row index
std::cout << it.col() << "\t";
std::cout << it.value() << std::endl;
}
}
return A;
}
// Minimal driver: one offset (0) and a 1x5 row of values passed to spdiags()
// for a 5x5 result.
int main()
{
    Matrix<int, 1, 1> d1;
    d1(0) = 0;

    // A dynamic-size MatrixXd has to be addressed with (row, col) indices;
    // single-index access triggers a different assertion. Only five values
    // fit: a sixth one would be out of bounds.
    MatrixXd d0(1, 5);
    d0 << 10, 20, 30, 30, 40;

    Eigen::SparseMatrix<double> Diag_laplacian = spdiags(d0, d1, 5, 5);
}
The expected result is (as you stated):
Row Col Val
0 0 10
1 1 20
2 2 30
3 3 30
4 4 40
To reproduce the results, I can either use VS (2013 in my case) or g++ (i.e. it's not Linux vs. Mac). As you are using g++, I will as well.
To reproduce the behavior you described on the Linux build, I compiled with
g++ -O3 -I"C:\usr\include" Source.cpp -o a.exe
Running a.exe gave me (as you stated)
Assertion failed: row >= 0 && row < rows() && col >= 0 && col < cols(), file C:\usr\include/Eigen/src/Core/DenseCoeffsBase.h, line 114
Debugging it showed me that it fails on the line
triplets.push_back(T(i, i + k, B(B_idx_start + i, k)));
when i == 1. Why? Exactly as @marc and I stated. B is not shaped/sized as you use it. Changing B(B_idx_start + i, k) to B(k, B_idx_start + i) resolves the issue.
Now, why does it work on the Mac? The answer has to do with the error itself. It's an assertion error. Assertions are not checked when NDEBUG is defined. So you probably compiled using something like
g++ -DNDEBUG -O3 -I"C:\usr\include" Source.cpp -o a.exe
on the Mac, and it ran fine, as then the assertions are ignored:
#ifdef NDEBUG
#define assert(_Expression) ((void)0)
#else
So, if there is an assertion failure, why does it work when we define NDEBUG? The answer to that is that the data pointer points to the first of five allocated doubles. Using the correct indexing, we should get index = k*1 + (B_idx_start + i), and since in this case k==0 and B_idx_start==0, we get index=i. This is within the bounds and therefore we don't get an out of bounds exception. Using the incorrect indexing, we get index = (B_idx_start + i)*1 + k which again, results in index=i. If the size of the matrix was (for example) 2x5, then we would have gotten an out of bounds exception.

MPI C++ class object use member function on slave processors

I am writing a code to send fixed number of rows (split along first dimension in case of 2D) and a slab (for 3D, again split along the first dimension). For example for a 100x100 I would like to send 25x100 to each of the 4 processors and in case of 3D (100x100x100) I would like to send 25x100x100 to each processors.
I have a class file and I create an instance of the class in the main program. My questions are: 1) How can I create class objects for each process (MPI rank) and be able to use the member functions (for example my_avg) in the code? 2) If I create the instance of the class inside enclosing braces, then I get scope issues. 3) In the output attached below I see that the send is not working. What am I missing? Is this the right way to do things?
Please help.
Pardon my lingo - c++ is not my first language.
Attached sample output.
each process gets 2
First Dimension 4
Number of processors 2
Displacement and blocklen 0 8
Displacement and blocklen 8 8
Displacement and blocklen 0 8
Displacement and blocklen 8 8
( 0.0000 + i -0.0000)( 1.0000 + i -1.1315)( 2.0000 + i -2.7556)( 3.0000 + i -3.4587)
( 0.5328 + i 0.4672)( 1.2190 + i -0.2190)( 2.0470 + i -1.0470)( 3.6789 + i -2.6789)
( 1.3586 + i 1.3207)( 2.8694 + i 0.0653)( 2.7670 + i -0.3835)( 4.0388 + i -1.5194)
( 2.4929 + i 2.1690)( 1.1037 + i 1.9654)( 2.1604 + i 0.9465)( 4.5891 + i -0.5297)
end of 2D matrix
rank=0 Rows : 2 Width : 4 local_n0 size 2
( 0.0000 + i -0.0000)( 1.0000 + i -1.1315)( 2.0000 + i -2.7556)( 3.0000 + i -3.4587)
( 0.5328 + i 0.4672)( 1.2190 + i -0.2190)( 2.0470 + i -1.0470)( 3.6789 + i -2.6789)
rank=1 Rows : 2 Width : 4 local_n0 size 2
( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)
( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)( 0.0000 + i 0.0000)
main program
#include <fftw3-mpi.h>
#include <cstdlib>
#include <iostream>
#include <iomanip> // needed for setfill
#include "cmplx_2d.hh"
#include "random.hh"
using namespace std;
// Splits an N0 x N1 complex matrix by rows across all MPI ranks with
// MPI_Scatterv and prints each rank's rows in rank order.
int main(int argc, char **argv){
const int N0 = 4, N1 = 4;
int i,j;
int rank,size;
double rx;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int local_n0=N0/size;
if(rank==0){
cout << " each process gets " << local_n0 << endl;
cout << " First Dimension " << N0 << endl;
cout << " Number of processors " << size << endl;
}
/* compute start and size of rows */
// NOTE: arrays sized by the runtime value `size` are variable-length arrays,
// a compiler extension rather than standard C++.
int n_rows[size];
int send_counts[size];
int displs[size];
displs[0]=0;
// Every rank computes the full table (and prints it, so the line below is
// emitted once per rank). The first N0 % size ranks receive one extra row.
for (i = 0; i < size; ++i) {
n_rows[i]=N0/size;
if(i<N0%size){
n_rows[i]=n_rows[i]+1;
}
// Counts and displacements are in elements (complex values), not rows.
send_counts[i]=n_rows[i]*N1;
if(i>0){
displs[i]=displs[i-1]+send_counts[i-1];
}
cout << " Displacement and blocklen " << displs[i] << " " << send_counts[i] << endl;
}
int rows=n_rows[rank];
/* create 2d complex objects */
// Every rank allocates the full N0 x N1 matrix, not only its own rows.
cmplx_2d arr_cmplx(N0,N1);
if(rank==0) {
/* initialize data -- random number */
for (i = 0; i < N0; ++i) {
for (j = 0; j < N1; ++j){
rx =((double)rand()/(double)RAND_MAX);
arr_cmplx(i,j)=complex<double>(rx*i+j,i-j-rx);
printf("(%8.4f + i%8.4f)", real(arr_cmplx(i,j)), imag(arr_cmplx(i,j)));
}
printf(" \n ");
}
printf(" \n end of 2D matrix \n \n ");
} // end of master
// NOTE(review): MPI_Scatterv reads/writes a single contiguous run of elements
// starting at &arr_cmplx(0,0); verify that cmplx_2d's storage guarantees the
// rows are contiguous in memory, otherwise only part of the data is valid.
if(rank==0) {
// Root keeps its own block where it already is (MPI_IN_PLACE as recvbuf).
MPI_Scatterv(&arr_cmplx(0,0),send_counts,displs,MPI_DOUBLE_COMPLEX,MPI_IN_PLACE,0,MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
} else {
/* I should allocate local array on each processor to receive data but if I do it here I get scope error */
//cmplx_2d arr_cmplx(rows,N1);
// Non-root ranks receive rows*N1 elements into the start of their matrix.
MPI_Scatterv(NULL,send_counts,displs,MPI_DOUBLE_COMPLEX,&arr_cmplx(0,0),rows*N1,MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
}
//printing, one proc at a time
// Token passing: each rank waits for an empty message from rank-1, prints,
// then sends an empty message to rank+1.
if(rank>0){
MPI_Status status;
MPI_Recv(NULL,0,MPI_DOUBLE_COMPLEX,rank-1,0,MPI_COMM_WORLD,&status);
}
cout<<"rank="<< rank<<" Rows : "<<rows<<" Width : "<<N1<< " local_n0 size " << local_n0 << endl;
for(i=0; i<rows; i++)
{
for(j=0; j<N1; j++)
printf("(%8.4f + i%8.4f)", real(arr_cmplx(i,j)), imag(arr_cmplx(i,j)));
cout<<endl;
}
if(rank<size-1){
MPI_Send(NULL,0,MPI_DOUBLE_COMPLEX,rank+1,0,MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
header file
#include <cassert> // for assert()
#include <complex>
using namespace std;
// Dense nx-by-ny matrix of std::complex<double>.
//
// All elements live in ONE contiguous, row-major block, so the address of
// element (0, 0) can be handed to APIs that expect nx*ny consecutive values
// (for example an MPI scatter). array_2d is a table of row pointers into that
// block, which keeps array_2d[i][j] indexing working.
//
// The class owns raw memory, so copying is disabled: a member-wise copy would
// make two objects free the same storage.
class cmplx_2d
{
private:
    int nx;                              // number of rows
    int ny;                              // number of columns
    std::complex<double>** array_2d;     // row pointers into `storage`
    std::complex<double>* storage;       // contiguous nx*ny element block
public:
    // constructor: allocates an x-by-y matrix with every element set to 0
    cmplx_2d(int x, int y) : nx(x), ny(y), array_2d(nullptr), storage(nullptr) {
        assert(x >= 0 && y >= 0);
        array_2d = new std::complex<double>*[x];
        // Value-initialisation ("()") zeroes every element.
        storage = new std::complex<double>[static_cast<std::size_t>(x) * static_cast<std::size_t>(y)]();
        for(int i(0); i < x; ++i) {
            array_2d[i] = storage + static_cast<std::size_t>(i) * static_cast<std::size_t>(y);
        }
    }
    // destructor: releases the element block and the row-pointer table
    ~cmplx_2d() {
        delete [] storage;
        delete [] array_2d;
    }
    // non-copyable (see class comment)
    cmplx_2d(const cmplx_2d&) = delete;
    cmplx_2d& operator=(const cmplx_2d&) = delete;
    // overload () operator: bounds-asserted access to element (i, j)
    std::complex<double>& operator()(const int i, const int j) const;
    // prototype of overload constructor - just a dummy function
    void my_avg( const cmplx_2d& in, int* ir, int* il, int* jr, int* jl, const int flag );
    // number of rows
    int getNx() const {
        return nx;
    }
    // number of columns
    int getNy() const {
        return ny;
    }
}; // end of class
header file
#include <iostream>
#include "cmplx_2d.hh"
#include "main_header.hh"
// Placeholder averaging operator.
//
// For flag == DX, stores 0.5 * (in(ir[i], j) + in(il[i], j)) into element
// (i, j) of this matrix; ir and il supply, per row i, the two row indices to
// average. jr and jl are currently unused. Any other flag only prints a
// message and leaves the matrix untouched.
//
// `in` must have the same dimensions as this matrix: the storage is not
// resized here, so adopting the dimensions of `in` would let the loops run
// past the allocated rows/columns.
void cmplx_2d::my_avg( const cmplx_2d& in, int* ir, int* il, int* jr, int* jl, const int flag ) {
    assert(in.getNx() == nx && in.getNy() == ny);
    if ( flag != DX )
    {
        std::cout << "NOT a valid flag for derivative!" << std::endl;
        return;
    }
    // derivative with respect to x
    for(int i(0); i < nx; ++i) {
        for(int j(0); j < ny; ++j) {
            array_2d[i][j] = 0.5*(in(ir[i],j)+in(il[i],j));
        }
    }
} // end
// Element access: returns a reference to element (i, j).
// Both indices are range-checked with assert, so the checks disappear when
// NDEBUG is defined.
complex<double>& cmplx_2d::operator()(const int i, const int j) const
{
    assert(i >= 0 && i < nx);
    assert(j >= 0 && j < ny);
    complex<double>* const row = array_2d[i];
    return row[j];
}
main header
// pi, computed as 4 * atan(1).
const double PI=4.0*atan(1.0);
// 2 * pi.
const double TWO_PI=2.0*PI;
// Operation selector passed as the `flag` argument of cmplx_2d::my_avg, which
// handles only DX. DY and LAP are presumably the y-derivative and the
// Laplacian -- TODO confirm.
enum flag {DX, DY, LAP};

How to call existing host function from device function in cuda [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Closed 8 years ago.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
We don’t allow questions seeking recommendations for books, tools, software libraries, and more. You can edit the question so it can be answered with facts and citations.
Improve this question
I have seen a similar question here
However,I could not get an exact answer here, and it is written in 2012.
I am trying to call cublasStatus_t cublasSgbmv(...) function, which is defined in "cublas_v2.h", in a __global__ function. However, I could not use the dynamic parallelism feature. I only have 1 source.cu file. However, I have read that I should compile it in a dynamic way so that it separates device and host functions, then I can link these outputs.
Is there anyone who knows how to do it, or a good source to explain it?
Thanks in advance
edit : if undervoted, please explain the reason at least for me to learn my mistake?
edit2 :
my specific problem is, I'm using the following code in my Source.cu :
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cstdio>
#include <stdio.h>
#include <device_launch_parameters.h>
#include <stdlib.h> //srand(), rand()
#include <time.h>
#include <builtin_types.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define IDX2C(i ,j , ld ) ((( j )*( ld ))+( i ))
#define HEIGHT 4
#define WIDTH 4
#define V 4
#define KL 2
#define KU 1
#define THREADS_PER_BLOCK 512
#pragma comment(lib, "cublas")
//#pragma comment(lib, "helper_cuda")
using namespace std;
// Fills the first width*height entries of `matrix` with pseudo-random
// integers in the range 1..10. Reseeds the generator from the current time
// on every call.
void create_Matrix(int* matrix, int width, int height){
    const int count = height * width;
    srand(time(NULL));
    int pos = 0;
    while (pos < count){
        matrix[pos] = 1 + rand() % 10;
        ++pos;
    }
}
// Writes the first `len` elements of `vector` to standard output on one line,
// each followed by a single space, then ends the line.
template <typename T>
void print_vector(T* vector, int len){
    int pos = 0;
    while (pos < len){
        std::cout << vector[pos] << " ";
        ++pos;
    }
    std::cout << std::endl;
}
// Fills a row-major height x width matrix so that only the band around the
// main diagonal is non-zero: row i gets pseudo-random values 1..10 in columns
// [i - kl, i + ku] (clipped to the matrix), every other entry is 0.
// Reseeds the generator from the current time on every call.
template <typename T>
void creating_bandedMatrix(T* bandedMatrix, int height, int width, int ku, int kl){
    // Start from an all-zero matrix.
    const int total = height * width;
    for (int k = 0; k < total; ++k){
        bandedMatrix[k] = 0;
    }
    srand(time(NULL));
    // Populate the band row by row.
    for (int row = 0; row < height; ++row){
        const int first = (row - kl < 0) ? 0 : row - kl;
        const int last = (row + ku + 1 > width) ? width : row + ku + 1; // exclusive
        for (int col = first; col < last; ++col){
            bandedMatrix[(row * width) + col] = (float)(rand() % (10) + 1);
        }
    }
}
// Prints a marker line, then the first width*height elements of `matrix` as
// "index:value " pairs, starting a new output line before every element whose
// index is a multiple of `width`.
template <typename T>
void print_matrix(T* matrix, int width, int height){
    const int total = width*height;
    std::cout << "asdsffffff" << std::endl;
    int idx = 0;
    while (idx < total){
        if (idx % width == 0)
            std::cout << std::endl;
        std::cout << idx << ":" << matrix[idx] << " ";
        ++idx;
    }
    std::cout << std::endl;
}
// Host reference for y = A * x: multiplies the row-major HEIGHT x WIDTH matrix
// `bandedMatrix` by the WIDTH-element vector `vector2` and prints the result.
//
// The product has one entry per matrix row, so the result holds HEIGHT
// elements and exactly those are printed. It is kept in a std::vector, which
// releases its memory when the function returns.
template <typename T>
void computeMatrixVectorMultiplication(T* bandedMatrix, T* vector2){
    std::vector<T> bandedHostResult(HEIGHT);
    for (int i = 0; i < HEIGHT; i++){
        T row_sum = 0;
        for (int j = 0; j < WIDTH; j++){
            row_sum += bandedMatrix[i*WIDTH + j] * vector2[j];
        }
        bandedHostResult[i] = row_sum;
    }
    //printing the result
    cout << "\n\nBanded Host Result...\n";
    print_vector(bandedHostResult.data(), HEIGHT);
}
// Converts a dense row-major banded matrix into band storage.
//
// lapack_matrix : destination, row-major lapack_h x lapack_w, fully zeroed
//                 first. Element A(i, j) of the source is stored in row
//                 ku + i - j, column j.
// bandedMatrix  : source, row-major banded_h x banded_w.
// kl, ku        : number of sub- and super-diagonals of the band.
//
// Only elements inside the band (-ku <= i - j <= kl) are copied. For elements
// outside it the target row ku + i - j is negative or beyond the last band
// row, so copying them would write outside `lapack_matrix`.
template <typename T>
void fillLapackMatrix(T* lapack_matrix, T* bandedMatrix, int kl, int ku, int banded_w, int banded_h, int lapack_w, int lapack_h){
    const int len = lapack_h * lapack_w;
    for (int k = 0; k < len; k++){
        lapack_matrix[k] = 0;
    }
    // i walks the source rows, j the source columns.
    for (int i = 0; i < banded_h; i++){
        for (int j = 0; j < banded_w && j < lapack_w; j++){
            const int lapack_i = ku + i - j;
            if (lapack_i < 0 || lapack_i > kl + ku || lapack_i >= lapack_h)
                continue; // outside the band or outside the destination
            lapack_matrix[lapack_i*lapack_w + j] = bandedMatrix[i*banded_w + j];
        }
    }
}
// Kernel: each thread with index < ldr runs one banded matrix-vector product
// through the device-side cuBLAS API, reading its input vector at
// B + index*n and writing its result at R + index*n.
//
// alpha and beta are dereferenced on the device, so they must point to memory
// the device can read.
// NOTE(review): `m` is passed as the leading dimension of A and the `lda` and
// `ldb` parameters are unused -- confirm against the band-storage layout
// (cuBLAS expects lda >= kl + ku + 1).
__global__ void device_cublasSgbmv(int m,int n,int kl, int ku,float* alpha, float* A, int lda ,float* B,int ldb,float*R, int ldr, float* beta){
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads without work exit before creating any cuBLAS state.
    if (index >= ldr)
        return;
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
        return;
    cublasSgbmv(handle, CUBLAS_OP_N, m, n, kl, ku, alpha, A, m, B+index*n, 1, beta, R+index*n, 1);
    // Release the handle; it would otherwise leak on every launch.
    cublasDestroy(handle);
}
// Sets every element of the row-major h x w matrix B to 1.
void fillNormalMatrix(float* B,int h,int w){
    int row = 0;
    while (row < h){
        int col = 0;
        while (col < w){
            B[row*w + col] = 1;
            ++col;
        }
        ++row;
    }
}
// Host driver: builds a random banded matrix, converts it to band storage,
// uploads it together with a matrix B of ones, launches device_cublasSgbmv
// and reads the result matrix back.
int main()
{
cublasStatus_t status;
float *A;
float *x, *y;
float *dev_x, *dev_y;
int incx, incy;
float *dev_A = 0;
float alpha = 1.0f;
float beta = 0.0f;
int matrixSize = WIDTH * HEIGHT;
int i, j;
cublasHandle_t handle;
/* Initialize CUBLAS */
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
//Allocate host memory for the matrices
A = (float *)malloc(matrixSize* sizeof(float));
//Allocate memory for host vectors
x = (float *)malloc(WIDTH * sizeof(float));
y = (float*)malloc(WIDTH * sizeof(float));
// Fill the matrices with test data
// NOTE(review): creating_bandedMatrix is declared as (matrix, height, width,
// ku, kl); this call passes WIDTH, HEIGHT, KU, KL, so height and width are
// swapped (harmless only while WIDTH == HEIGHT) -- confirm intent.
creating_bandedMatrix(A, WIDTH, HEIGHT, KU, KL);
cout << "Banded Matrix\n";
print_matrix(A, WIDTH, HEIGHT);
//Fill the vectors with random data
for (i = 0; i < WIDTH; i++){
x[i] = 1;// (float)(rand() % (10) + 1);:
y[i] = (float)(rand() % (10) + 1);
}
cout << "\nvector x...\n";
print_vector(x, WIDTH);
//cout << "\nvector y...\n";
//print_vector(y, WIDTH);
//Allocate device memory for the matrix
if (cudaMalloc((void **)&dev_A, matrixSize * sizeof(float)) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
//Allocate device memory for vectors
if (cudaMalloc((void**)&dev_x, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&dev_y, WIDTH * sizeof(float)) != cudaSuccess){
fprintf(stderr, "Device Vector Allocation PROBLEM\n");
return EXIT_FAILURE;
}
// Initialize the device vectors with the host vectors
status = cublasSetVector(WIDTH, sizeof(float), x, 1, dev_x, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write x vector)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(WIDTH, sizeof(float), y, 1, dev_y, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write y vector)\n");
return EXIT_FAILURE;
}
//initialize matrix with lapack format
// Band storage has KL + KU + 1 rows and min(WIDTH, HEIGHT) columns.
int lapack_width = WIDTH > HEIGHT ? HEIGHT : WIDTH;
int lapack_height = KL + KU + 1;
int lapackSize = lapack_height * lapack_width;
float* lapack_matrix = (float*)malloc(lapackSize * sizeof(float));
fillLapackMatrix(lapack_matrix, A, KL, KU, WIDTH, HEIGHT, lapack_width, lapack_height);
cout << "\n\nLAPACK MAtrix\n";
print_matrix(lapack_matrix, lapack_width, lapack_height);
//convert to column column matrix
// Transpose into `col`. NOTE(review): the loop bounds use WIDTH/HEIGHT while
// both buffers hold lapackSize elements; the sizes coincide only for the
// current constants -- confirm.
float* col = (float*)malloc(lapackSize * sizeof(float));
for (i = 0; i < WIDTH; i++){
for (j = 0; j < HEIGHT; j++){
col[i + WIDTH*j] = lapack_matrix[WIDTH*i + j];
}
}
cout << "Lapack Column Based Matrix\n";
print_matrix(col,HEIGHT-1,WIDTH);
//status = cublasSetVector(lapackSize, sizeof(float), A, 1, dev_A, 1);
cublasSetMatrix(HEIGHT, WIDTH, sizeof(float), col, HEIGHT, dev_A, HEIGHT);
cublasOperation_t trans = CUBLAS_OP_N;
incy = incx = 1;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////// Banded Matrix Matrix Multipllicatio ///////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
float* B,*dev_B,*dev_R,*R;
B = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
R = (float*)malloc(WIDTH*HEIGHT*sizeof(float));
fillNormalMatrix(B,WIDTH,HEIGHT);
cudaMalloc((void**)&dev_B,matrixSize*sizeof(*B));
cudaMalloc((void**)&dev_R,matrixSize*sizeof(*R));
cublasSetMatrix(HEIGHT, WIDTH, sizeof(*B), B, HEIGHT, dev_B, HEIGHT);
cout << "Matrix B\n";
print_matrix(B,HEIGHT,WIDTH);
cout << "gfsdf\n";
// One block of 4 threads.
// NOTE(review): &alpha and &beta are addresses of host stack variables handed
// to a __global__ kernel -- confirm the device can actually read them.
device_cublasSgbmv<<<1,4>>>(HEIGHT, WIDTH, KL, KU, &alpha, dev_A, WIDTH, dev_B, HEIGHT, dev_R, HEIGHT,&beta);
cout << "after\n";
// Copy the result back; neither the launch nor this call is checked for
// errors, and none of the host or device buffers are released before exit.
cublasGetMatrix(HEIGHT,WIDTH, sizeof (*R) ,dev_R ,WIDTH,R,WIDTH);
getchar();
return 0;
}
and compile it like :
nvcc -gencode=arch=compute_35,code=sm_35 -lcublas -lcudadevrt -O3 Source.cu -o Source.o -dc
g++ Source.o -lcublas -lcudart
then, I get the following :
In function `__sti____cudaRegisterAll_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a()':
tmpxft_00001f1e_00000000-3_lapack_vector.cudafe1.cpp:(.text.startup+0x575): undefined reference to `__cudaRegisterLinkedBinary_48_tmpxft_00001f1e_00000000_6_Source_cpp1_ii_ebe2258a'
collect2: error: ld returned 1 exit status
You can compile and link the code you have now shown with a single command like this:
nvcc -arch=sm_35 -rdc=true -lcublas -lcublas_device -lcudadevrt -o test Source.cu
You may get some warnings like this:
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a:maxwell_sm50_ssyrk.o'
Those can be safely ignored.