When I try to compile the following code, I get the following errors:
hmm.cpp:16:29: error: ‘double gamma [3000][4]’ redeclared as different kind of symbol
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:266:1: error: previous declaration of >‘double gamma(double)’
hmm.cpp: In function ‘double updateModel(int&, int, int, double, double, int, double*, >double ()[4], double ()[5005], double*)’:
hmm.cpp:67:11: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:67:14: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:67:18: error: assignment of function ‘double gamma(double)’
hmm.cpp:67:18: error: cannot convert ‘int’ to ‘double(double)throw ()’ in assignment
hmm.cpp:69:12: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:69:15: warning: pointer to a function used in arithmetic [-Wpointer-arith]
hmm.cpp:69:46: error: invalid operands of types ‘double(double)throw ()’ and ‘double’ to >binary ‘operator+’
hmm.cpp:69:46: error: in evaluation of ‘operator+=(double(double)throw (), double)’
I get similar errors every time `gamma` is used in the code.
Code follows:
#include <iostream>
#include <fstream>
#include <cstring>
#include <cstdlib>
#include <cmath>
//double atof(const char* str)
using namespace std;
#define MAXT 3000
#define MAXSTATE 4
#define MAXRANGE 5005
#define maxString 52
#define maxResult 405
// Global work tables filled by updateModel and indexed as [t][state].
double alpha[MAXT][MAXSTATE];
double beta[MAXT][MAXSTATE];
// NOTE(review): at global scope this name collides with the C library's
// `double gamma(double)` that <cmath> exposes on glibc (see the compiler
// output quoted at the top of the post). Renaming the array, together with
// every use of it, removes the "redeclared as different kind of symbol" error.
double gamma [MAXT][MAXSTATE];
double delta[MAXT][MAXSTATE];
double psi[MAXT][MAXSTATE];//Ψ
/**
 * Maps `value` linearly from the interval [min, max] onto an integer bucket.
 *
 * The result is (k - 1) * (value - min) / (max - min), truncated toward zero,
 * so `min` maps to 0 and `max` maps to k - 1.
 *
 * @param value the sample to bucket
 * @param min   lower end of the value range
 * @param max   upper end of the value range
 * @param k     number of buckets
 * @return the bucket index; 0 when the range is degenerate (max == min)
 */
inline int getIndex(const double& value,const double& min,const double&
max,const int& k)
{
    const double span = max - min;
    // A zero-width range would divide by zero and convert inf/NaN to int,
    // which is undefined behaviour; fall back to the first bucket instead.
    if (span == 0.0) {
        return 0;
    }
    //ret = int((value - min)*((max-min)/k)); // [possible error 1]
    return static_cast<int>((k - 1) * (value - min) / span);
}
// all the matrix start from 1 to max
// oMin is the minimal value of O
// Runs one pass over the observations _o[1..tWindow]: fills the global
// alpha/beta/gamma/delta/psi tables, then overwrites _Pi, _A and _B with
// re-estimated values. The selected final state is written to q and p is
// returned.
// NOTE(review): the block braces are missing from this listing, so loop
// nesting has to be inferred; `xi` is used below but is not declared anywhere
// in the visible code.
double updateModel(int& q,int tWindow, int oRange, double oMin, double oMax, int
stateNum, double _o[MAXT],double _A[MAXSTATE][MAXSTATE],double _B[MAXSTATE][MAXRANGE],double _Pi[MAXSTATE])
double p;
/* calculate lambda */
// alpha
// Forward table: alpha[1][s] comes from _Pi and _B; alpha[t][s] is then
// accumulated over the predecessor states j.
for(int s=1;s<=stateNum;s++)
alpha[1][s] = _Pi[s]*_B[s][getIndex(_o[1], oMin, oMax, oRange)];
for(int t=2;t<=tWindow;t++)
for(int s=1;s<=stateNum;s++)
alpha[t][s] = 0;
for(int j=1;j<=stateNum;j++)
alpha[t][s] += alpha[t-1][j] * _A[j][s] * _B[s][getIndex(_o[t], oMin, oMax, oRange)];
// p
// NOTE(review): p is zeroed here, but the statement that accumulates it
// inside the following loop does not appear in this listing.
p = 0;
for(int i=1;i<=stateNum;i++)
// Backward table: beta[tWindow][s] = 1, then filled from t = tWindow-1 down to 1.
for(int s = 1; s <= stateNum; s++)
beta[tWindow][s] = 1;
for(int t = tWindow - 1; t >= 1; t--)
for(int s = 1; s <= stateNum; s++)
beta[t][s] = 0;
for(int j=1;j<=stateNum;j++)
beta[t][s] += beta[t + 1][j] * _A[j][s] * _B[s][getIndex(_o[t + 1], oMin, oMax, oRange)];
// gamma[t][i] = alpha[t][i] * beta[t][i], normalised by the sum of
// alpha * beta over all states at time t.
for (int t = 1; t <= tWindow; t ++){
for (int i = 1; i <= stateNum; i ++){
gamma[t][i] = 0;
for (int s = 1; s <= stateNum; s ++){
gamma[t][i] += (alpha[t][s] * beta[t][s]);
gamma[t][i] = alpha[t][i] * beta[t][i] / gamma[t][i];
//delta, psi
for (int i = 1; i <= stateNum; i ++){
delta[1][i] = _Pi[i] * _B[i][getIndex(_o[1], oMin, oMax, oRange)];
psi[1][i] = 0;
for (int t = 2; t <= tWindow; t ++){
for (int i = 1; i <= stateNum; i ++){
int k = 1;
// NOTE(review): this writes delta[t][1] while the rest of the loop works on
// delta[t][i] — looks like a typo for index i; confirm.
delta[t][1] = delta[t - 1][1] * _A[1][i] * _B[i][getIndex(_o[t], oMin, oMax, oRange)];
for (int j = 2; j <= stateNum; j ++)
if ((delta[t - 1][j] * _A[j][i]) > (delta[t - 1][k] *
_A[k][i]) )
delta[t][i] = delta[t - 1][j] * _A[j][i] *
_B[i][getIndex(_o[t], oMin, oMax, oRange)];
k = j;
psi[t][i] = k;
// Pick the final state with the largest delta[tWindow][*]; k tracks its index.
int k = 1;
double p_star = delta[tWindow][1];
for (int i = 1; i <= stateNum - 1; i ++)
if (delta[tWindow][i + 1] > delta[tWindow][k])
p_star = delta[tWindow][i + 1];
k = i + 1;
int q_star = k;
// xi[t][i][j]: first accumulated as a sum over all state pairs (s1, s2),
// then replaced by the (i, j) term divided by that sum.
for (int t = 1; t <= tWindow - 1; t ++)
for (int i = 1; i <= stateNum; i ++)
for (int j = 1; j <= stateNum; j ++)
xi[t][i][j] = 0;
for (int s1 = 1; s1 <= stateNum; s1 ++)
for (int s2 = 1; s2 <= stateNum; s2 ++)
xi[t][i][j] = xi[t][i][j] + beta[t + 1][s2]
* _B[s2][getIndex(_o[t + 1], oMin, oMax, oRange)] * _A[s1][s2] * alpha [t][s1];
xi[t][i][j] = beta[t + 1][j] * _B[j][getIndex(_o[t + 1],
oMin, oMax, oRange)] * _A[i][j] * alpha [t][i] / xi[t][i][j];
// Re-estimate _Pi, _A and _B from gamma and xi.
for (int i = 1; i <= stateNum; i ++)
_Pi[i] = gamma[1][i];
for (int j = 1; j <= stateNum; j ++)
double numerator = 0;
double denominator = 0;
for (int t = 1; t <= tWindow - 1; t ++)
numerator += xi[t][i][j];
denominator += gamma[t][i];
_A[i][j] = numerator / denominator;
double tmp,detmp;
for(int k=1; k<=oRange; k++)
tmp = 0;
detmp = 0;
for(int t=1; t<=tWindow; t++)
if(getIndex(_o[t], oMin, oMax, oRange) == k ) tmp+=gamma[t][i];
// NOTE(review): no visible statement ever adds to detmp, so as listed this
// divides by zero — a `detmp += gamma[t][i];` line was presumably lost.
_B[i][k] = tmp/detmp;
// Report the selected final state to the caller.
q = q_star;
return p;
//double _A[maxState][maxState],double _B[maxState][MAXRANGE],double _Pi[maxState]
// Calls updateModel, stores the returned value in previousP, then calls
// updateModel again with the same leading arguments.
// NOTE(review): both calls are cut off after `_o,` in this listing, and any
// loop that compares currentP with previousP against `threshold` is not
// visible here.
void converge(int& q, double previousP,double threshold, int tWindow, int
maxRange, double oMin, double oMax, int stateNum, double _o[MAXT],double _A[MAXSTATE][MAXSTATE],double _B[MAXSTATE][MAXRANGE],double _Pi[MAXSTATE])
double currentP = updateModel(q, tWindow,maxRange,oMin,oMax,stateNum, _o,
previousP = currentP;
currentP = updateModel(q, tWindow,maxRange,oMin,oMax,stateNum, _o,
// Driver: opens two input files and one output file, parses counts and
// observations into _o (starting at index 1), initialises _A, _B and _Pi to
// uniform values, calls converge, and then for each of the cnt2 steps derives
// a predicted value from the largest _B[q_star][i] and writes result[1..cnt2]
// to the output file.
// NOTE(review): braces, the statements that fill `tmps` from the input
// streams, and the tails of some calls are missing from this listing. The
// bare "Begin-/End-" lines are presumably the remains of block comments.
int main()
ifstream fin1("..\\data\\input.txt");
ifstream fin2("..\\data\\input2.txt");
ofstream fout("..\\data\\output.txt");
double result[maxResult];
double _o[MAXT];
double _Pi[MAXSTATE];
int oRange;
int nState;
double oMin;
double oMax;
int tWindow;
Begin- Input data
string tnum;
char tmps[maxString];
double t;
int cnt1, cnt2;
int cnttmp;
/* Get the num of input1 and input2 */
t = atof(tmps);
cnt1 = int(t);
t = atof(tmps);
cnt2 = int(t);
/* Get the real data of input1 and input2 */
cnttmp = 1;
// NOTE(review): the min/max tracking starts from 0 rather than from the first
// sample, so oMin/oMax only reflect the data if it straddles zero — confirm.
oMin = oMax = 0;
t = atof(tmps);
_o[cnttmp++] = t;
if(oMin > t) oMin = t;
if(oMax < t) oMax = t;
// printf("1: %lf\n",t);
//printf("oMin = %lf, oMax = %lf\n",oMin, oMax);
t = atof(tmps);
_o[cnttmp++] = t;
//printf("2: %lf\n",t);
End- Input data
Parameters to set:
// NOTE(review): oRange and tWindow are already declared near the top of main;
// as listed, these two lines redeclare them.
int oRange;
int tWindow;
int maxRange = 5000;
tWindow = 70;
nState = 3;
double previousP = 0;
double threshold = 1e-8;
// [To do]
// Uniform initial model. _A and _B are used here but their declarations are
// not visible in this listing.
for(int i=1;i<=nState;i++)
for(int j=1;j<=nState;j++)
_A[i][j] = (1.0)/ nState;
for(int i=1;i<=nState;i++)
for(int j=1;j<=maxRange;j++)
_B[i][j] = (1.0)/maxRange;
for(int i=1;i<=nState;i++)
_Pi[i] = (1.0)/nState;
Begin- Process data
int q_star;
converge(q_star,previousP,threshold, tWindow, maxRange, oMin, oMax, 3,
int bestIndex = 1; // the index of O(T+1)
int tmp;
int choice;
double predictValue,currentValue;
double bestValue;
for(int k=1;k<=cnt2;k++) // cnt2 Real Data
currentValue = _o[cnt1+k-1];
bestValue = 0;
// Find the bucket with the largest _B value for state q_star.
for(int i=1;i<=maxRange;i++)
//tmp = getIndex(_o[cnt1+k], oMin, oMax, maxRange);
if(_B[q_star][i] > bestValue)
bestValue = _B[q_star][i];
bestIndex = i;
predictValue = oMin + (oMax - oMin) * (bestIndex-1) /(maxRange-1);
//index --> value
converge(q_star,previousP,threshold, tWindow, maxRange, oMin, oMax,
3, _o+k,_A,_B,_Pi);
// choice is +1 when the prediction lies above the current value, else -1.
if(predictValue > currentValue) choice = 1;
else choice = -1;
// NOTE(review): result holds maxResult entries; no visible check keeps k
// (up to cnt2) inside that bound.
result[k] = choice * (_o[cnt1+k] - _o[cnt1+k-1]);
End- Process data
Begin- Output data
for(int i=1;i<=cnt2;i++)
fout << result[i] << endl;
End- Output data
return 0;
Could someone tell me how to fix this error?
Thank you.
The error message is pretty clear:
mathcalls.h:266:1: error: previous declaration of >‘double gamma(double)’
There is a function double gamma(double) that you get when importing cmath.
Change the name of your array.
Your variable gamma conflicts with a symbol defined in mathcalls.h, a prototype for the gamma function.
I am trying to extract patches from an image in parallel, with a pixel shift (overlap) between neighbouring patches. I have written the CPU version of the code, but I have not been able to convert the for loops, which advance by the pixel shift, to CUDA. Below is the part of the code that uses those loops: the CreatePatchDataSet function contains the for loops whose increment is the pixel shift. Please help me convert this function to CUDA. The code follows.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>
using namespace std;
using namespace cv;
#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)
/**
 * Copies channel 0 of every pixel of `input` into the interleaved array
 * `output`, at index cols * channels * row + channels * col.
 *
 * Each row of `input` is read through `ptr<double>`.
 * NOTE(review): that is only meaningful when the matrix stores doubles
 * (CV_64F) — confirm at the call site.
 *
 * @param input  source matrix
 * @param output destination buffer with at least rows * cols * channels entries
 */
void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
    const int channels = input.channels();
    const int row_stride = input.cols * channels;
    for (int i = 0; i < input.rows; i++) {
        const double* src = input.ptr<double>(i);
        for (int j = 0; j < input.cols; j++) {
            // Step through the source with the same per-pixel channel stride
            // used for the destination. For single-channel input this is src[j].
            output[row_stride * i + channels * j + 0] = src[channels * j];
        }
    }
}
// Counts the patch positions along one axis of length `extent`: the first
// patch covers `patch_size` pixels and every further patch extends the covered
// length by `pixel_shift`, until the covered length reaches `extent`.
// Returns 0 for a non-positive patch size or shift, which would otherwise
// never terminate.
static int CountPatchPositions(const int extent, const int patch_size, const int pixel_shift) {
    if (patch_size <= 0 || pixel_shift <= 0) {
        return 0;
    }
    int count = 0;
    int covered = 0;
    while (covered < extent) {
        // patch_size - (patch_size - pixel_shift) in the original is pixel_shift.
        covered += (covered == 0) ? patch_size : pixel_shift;
        ++count;
    }
    return count;
}

/**
 * Computes how many overlapping patches cover a width x height image.
 *
 * @param width            image width in pixels
 * @param height           image height in pixels
 * @param patch_size       side length of a square patch
 * @param pixel_shift      step between neighbouring patches
 * @param num_of_patch     out: total patch count (columns * rows)
 * @param num_of_patch_col out: patch count along the width
 * @param num_of_patch_row out: patch count along the height
 */
void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
    *num_of_patch_col = CountPatchPositions(width, patch_size, pixel_shift);
    *num_of_patch_row = CountPatchPositions(height, patch_size, pixel_shift);
    *num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}
/**
 * Extracts overlapping patch_size x patch_size patches from a row-major image.
 *
 * Patch (counter_row, counter_col) starts at pixel
 * (counter_row * pixel_shift, counter_col * pixel_shift). Its element (ii, jj)
 * is stored at
 *   patch_data[num_patches * (patch_size * ii + jj)
 *              + num_of_patch_col * counter_row + counter_col],
 * so each patch element index forms one contiguous run over all patches.
 * Elements that fall outside the image are written as 0.
 *
 * The patch counters are the loop variables here, so a patch index can never
 * exceed num_of_patch_row / num_of_patch_col.
 *
 * @param original_data    image, width * height doubles, row-major
 * @param patch_data       output, patch_size^2 * num_of_patch_row * num_of_patch_col doubles
 * @param width            image width
 * @param height           image height
 * @param pixel_shift      step between neighbouring patches (no-op if <= 0)
 * @param patch_size       side length of a patch
 * @param num_of_patch_col number of patches per row of patches
 * @param num_of_patch_row number of rows of patches
 */
void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    if (pixel_shift <= 0) {
        return;  // a zero or negative step would never advance through the image
    }
    const int num_of_patch_image = num_of_patch_row * num_of_patch_col;

    for (int counter_row = 0; counter_row < num_of_patch_row; ++counter_row) {
        const int i = counter_row * pixel_shift;
        if (i >= height) {
            break;  // same stop condition as the original `i < height` loop
        }
        for (int counter_col = 0; counter_col < num_of_patch_col; ++counter_col) {
            const int j = counter_col * pixel_shift;
            if (j >= width) {
                break;  // same stop condition as the original `j < width` loop
            }
            const int patch_index = num_of_patch_col * counter_row + counter_col;
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    const int dst = num_of_patch_image * (patch_size * ii + jj) + patch_index;
                    const bool inside = (i + ii) < height && (j + jj) < width;
                    patch_data[dst] = inside ? original_data[width * (i + ii) + (j + jj)] : 0.;
                }
            }
        }
    }
}
// CPU driver: loads an image, upscales it by `ratio`, converts it to a double
// array, computes the patch counts and fills FY with the extracted patches.
// NOTE(review): braces are missing and the cv::resize call is cut off after
// its fifth argument in this listing.
int main()
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0,
double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
// NOTE(review): imageH is created as CV_8UC1, while ConvertMat2DoubleArray
// reads rows through ptr<double>; confirm the matrix really holds doubles
// (e.g. convert it to CV_64F first).
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
// One slot per patch element (dimH) per patch.
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
// NOTE(review): orgimageH and FY are not freed in the visible code.
return 0;
The results I got for first 10 values in CPU version:
patch numbers:
I have tried to convert this function to a kernel function using CUDA, but it goes into an infinite loop. As I am very new to CUDA, could you please help me find the problem in the code?
/**
 * CUDA version of the CPU CreatePatchDataSet: each thread extracts whole
 * patches.
 *
 * Threads are mapped to patch indices, not pixel coordinates: the x dimension
 * walks patch columns and the y dimension walks patch rows, each with a
 * grid-stride loop so any launch configuration covers every patch. The
 * top-left pixel of a patch is its patch index times pixel_shift, which
 * reproduces the `i += pixel_shift` / `j += pixel_shift` stepping of the CPU
 * loops.
 *
 * Fixes relative to the previous kernel:
 *  - the loop variables now advance inside the loops (the old `while` never
 *    changed i or j, so it never terminated);
 *  - counter_row / counter_col are derived from the thread index instead of
 *    being fixed at 0, so threads no longer all write to patch 0;
 *  - the pixel shift is applied.
 *
 * Output layout is unchanged: element (ii, jj) of patch (row, col) goes to
 *   patch_data[num_patches * (patch_size * ii + jj) + num_of_patch_col * row + col]
 * and elements outside the image are written as 0.
 */
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    const int num_of_patch_image = num_of_patch_row * num_of_patch_col;

    for (int counter_row = threadIdx.y + blockDim.y * blockIdx.y;
         counter_row < num_of_patch_row;
         counter_row += blockDim.y * gridDim.y) {
        const int i = counter_row * pixel_shift;

        for (int counter_col = threadIdx.x + blockDim.x * blockIdx.x;
             counter_col < num_of_patch_col;
             counter_col += blockDim.x * gridDim.x) {
            const int j = counter_col * pixel_shift;
            const int patch_index = num_of_patch_col * counter_row + counter_col;

            //Get Low Resolution Image
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    const int dst = num_of_patch_image * (patch_size * ii + jj) + patch_index;
                    if ((i + ii) < height && (j + jj) < width) {
                        patch_data[dst] = original_data[width * (i + ii) + (j + jj)];
                    }
                    else {
                        patch_data[dst] = 0.;
                    }
                }
            }
        }
    }
}
// GPU driver: prepares the upscaled image as on the CPU path, copies it to the
// device, launches the CreatePatchDataSet kernel over a 2-D grid of 16x16
// blocks and copies the patches back into FY.
// NOTE(review): braces are missing from this listing; gpuErrchk is defined
// elsewhere.
int main()
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
// Device copies of the source image and of the patch buffer.
double *d_orgimageH;
gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
double *d_FY;
gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH * num_of_patch_image));
gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH , sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
// Grid sized so that x covers the image width and y the image height.
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
// This blocking copy is where a kernel that never terminates shows up as a hang.
gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
// cout<<"Hello world";
// NOTE(review): no cudaFree / free calls are visible for d_orgimageH, d_FY,
// orgimageH or FY.
return 0;
Image I have used: [1]: https://i.stack.imgur.com/Ywg7p.png
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
is outside the while loop in your kernel. As i and j never change inside the while loop, it isn't stopping. There could be more problems here, but this is the most prominent one.
EDIT: Another one that I found, is that you have only one while over both i and j instead of one for each. You should probably use for loops like in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
i < height;
i += pixel_shift * blockDim.x * gridDim.x) {
for (j = ...; j < ...; j += ...) {
/* ... */
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
counter_row < num_of_patch_row;
counter_row += blockDim.y * gridDim.y) {
i = counter_row * pixel_shift;
if (i > height)
for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
counter_col < num_of_patch_col;
counter_col += blockDim.x * gridDim.x) {
j = counter_col * pixel_shift;
if (j > width)
/* ... */
I have also exchanged the x/y fields of the execution parameters between the inner and the outer loop, as it seemed more appropriate considering that the x field is continuous in warps (memory access benefits).
I was making a ray tracer, and the compiler reported the following errors in the header file perlin.h. They have to do with the `return perlin_interp(...)` statement. Please help me! Thanks.
the two errors:
Error C2662 'double perlin::perlin_interp(vec3 [][2][2],double,double,double)': cannot convert 'this' pointer from 'const perlin' to 'perlin &' RayTracer c:\users\graha\source\repos\raytracer\perlin.h 57
Severity Code Description Project File Line Suppression State
Error (active) E1086 the object has type qualifiers that are not compatible with the member function "perlin::perlin_interp" RayTracer C:\Users\graha\source\repos\RayTracer\perlin.h 57
#include "vec3.h"
#include "rtweekend.h"
#include <random>
/**
 * Trilinear interpolation of the eight corner values of a unit cube.
 *
 * @param c corner values; c[i][j][k] belongs to the corner at (i, j, k)
 * @param u interpolation weight along the first index
 * @param v interpolation weight along the second index
 * @param w interpolation weight along the third index
 * @return the corner values blended with weights u/(1-u), v/(1-v), w/(1-w)
 */
inline double trilinear_interp(double c[2][2][2], double u, double v, double w) {
    auto accum = 0.0;
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            for (int k = 0; k < 2; k++) {
                // Index 1 selects the weight itself, index 0 its complement.
                accum += (i*u + (1 - i)*(1 - u))*
                         (j*v + (1 - j)*(1 - v))*
                         (k*w + (1 - k)*(1 - w))*c[i][j][k];
            }
        }
    }
    return accum;
}
class perlin {
perlin() {
ranvec = new vec3[point_count];
for (int i = 0; i < point_count; ++i) {
ranvec[i] = unit_vector(vec3::random(-1, 1));
perm_x = perlin_generate_perm();
perm_y = perlin_generate_perm();
perm_z = perlin_generate_perm();
~perlin() {
delete[] ranvec;
delete[] perm_x;
delete[] perm_y;
delete[] perm_z;
double noise(const point3 p) const{
auto u = p.x() - floor(p.x());
auto v = p.y() - floor(p.y());
auto w = p.z() - floor(p.z());
int i = floor(p.x());
int j = floor(p.y());
int k = floor(p.z());
vec3 c[2][2][2];
for (int di = 0; di < 2; di++)
for (int dj = 0; dj < 2; dj++)
for (int dk = 0; dk < 2; dk++)
c[di][dj][dk] = ranvec[
perm_x[(i + di) & 255] ^
perm_y[(j + dj) & 255] ^
perm_z[(k + dk) & 255]
return perlin_interp(c, u, v, w); //problem area
vec3* ranvec;
static const int point_count = 256;
double* ranfloat;
int* perm_x;
int* perm_y;
int* perm_z;
static int* perlin_generate_perm() {
auto p = new int[point_count];
for (int i = 0; i < perlin::point_count; i++)
p[i] = i;
permute(p, point_count);
return p;
static void permute(int* p, int n) {
for (int i = n - 1; i > 0; i--) {
int target = random_int(0, i);
int tmp = p[i];
p[i] = p[target];
p[target] = tmp;
inline double perlin_interp(vec3 c[2][2][2], double u, double v, double w) {
auto uu = u * u*(3 - 2 * u);
auto vv = v * v*(3 - 2 * v);
auto ww = w * w*(3 - 2 * w);
auto accum = 0.0;
for (int i = 0; i < 2; i++)
for (int j = 0; j < 2; j++)
for (int k = 0; k < 2; k++) {
vec3 weight_v(u - i, v - j, w - k);
accum += (i*uu + (1 - i)*(1 - uu))
* (j*vv + (1 - j)*(1 - vv))
* (k*ww + (1 - k)*(1 - ww))
* dot(c[i][j][k], weight_v);
return accum;
My C++ code (shown below) works on this site:
GDB Online but not in Visual Studio, where it crashes at
iterations[imag_times][real_times] = i % (iter / 2);
when imag_times is 1 and real_times is 0 with the exception being Exception has occurred. Segmentation fault
I have installed GDB version 7.6.1.
My Question: Does anybody know how to fix that and why this is happening?
#include <iostream>
using namespace std;
int main()
// initialization
const double real_min = -1;
const double real_max = 1;
const double imag_min = -1;
const double imag_max = 1;
const int iter = 30;
const double real_offs = 0.01;
const double imag_offs = 0.01;
double z_real = 0;
double z_imag = 0;
double c_real = real_min;
double c_imag = imag_max;
int real_times = 0;
int imag_times = 0;
int** iterations = new int*[1];
iterations[0] = new int;
int i = 0;
// start
while(c_imag >= imag_min)
iterations = (int**)realloc(iterations, sizeof(int*) * (imag_times + 1));
real_times = 0;
c_real = real_min;
while(c_real <= real_max)
iterations[imag_times] = (int*)realloc(iterations[imag_times], sizeof(int) * (real_times + 1));
z_real = 0;
z_imag = 0;
for(i = 0; i < iter; i++)
double z_imag2 = z_imag * z_imag;
z_imag = 2 * z_real * z_imag + c_imag;
z_real = z_real * z_real - z_imag2 + c_real;
if(z_real * z_real + z_imag * z_imag > 4)
iterations[imag_times][real_times] = i % (iter / 2);
c_real = real_min + real_offs * real_times;
c_imag = imag_max - imag_offs * imag_times;
// output
for(int i = 0; i < imag_times; i++)
for(int j = 0; j < real_times; j++)
cout << iterations[i][j];
cout << ",";
cout << "\n";
cout << "done";
std::cin.get(); // pause so the program doesnt exit instantly
return 0;
Thanks in advance!
I have used OpenMP to parallelize my C++ code as below:
// Per-shell accumulators: grp sums e_dot_e, grparallel and grbot count pairs
// whose pdotr is above 0.9659 / below 0.2588 respectively.
// NOTE(review): shell_num is not a constant expression, so these are
// variable-length arrays (a compiler extension in C++).
int shell_num = 50, grparallel[shell_num],grbot[shell_num];
double p_x,p_y,grp[shell_num];
for (int f = 0; f < shell_num; f++)
grp[f] = 0;
grparallel[f] = 0;
grbot[f] = 0;
//some code...
// NOTE(review): this clause names whole arrays. Array reductions in C/C++
// were only added in OpenMP 4.5, and for variable-length arrays the
// array-section form is presumably needed, e.g.
//   reduction(+ : grp[:shell_num], grparallel[:shell_num], grbot[:shell_num])
// With an older OpenMP implementation, per-thread local arrays merged in a
// critical section are the usual workaround. Confirm against the compiler's
// supported OpenMP version.
#pragma omp parallel for reduction(+ : grp,grparallel,grbot)
for(int i = 0; i < N; i++){ //some code
for(int j = 0; j < N; j++){
if (j==i) continue;
double delta_x = x[i]-x[j],
delta_y = y[i]-y[j],
e_dot_e = e_x[i] * e_x[j] + e_y[i] * e_y[j],
e_cross_e = e_x[i] * e_y[j] - e_y[i] * e_x[j];
if (j > i)
double fasele = sqrt(dist(x[i],y[i],x[j],y[j],L));
for (int h = 0; h < shell_num; h++) //determine periodic distance between i and j is in which shel
if( L * h / 100 < fasele && fasele < L * (h + 1) / 100 )
{grp[h]+= e_dot_e;
double pdotr = abs(periodic(delta_x,L) * p_x + periodic(delta_y,L) * p_y)/fasele;
if (pdotr > 0.9659)
grparallel[h]+= 1;}else if(pdotr < 0.2588)
grbot[h]+= 1;
When I run the code in terminal, there is an error:
‘grp’ has invalid type for ‘reduction’
The same error occurs for grparallel and grbot.
How can I remove the error?
I implemented the Jacobi algorithm using TBB and it works just fine. Then I parallelized the convergence calculation using a reduction, but for some reason, if I use more than 1 logical core, I get a segmentation fault and I can't figure out why.
I can use more than 1 thread on a system that has only 1 logical core.
The same implementation using OpenMP works without a hassle
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/tick_count.h>
// ----------------------------------------------------------------
#define SIZE 1024
#define RESIDUO 0.0009f*SIZE
#define THREADS 2
using namespace tbb;
// ----------------------------------------------------------------
struct Sum {
float ret;
float (*a)[SIZE];
float (*x);
float (*b);
Sum(float A[SIZE][SIZE], float X[SIZE], float B[SIZE]) : ret(0), a(A), x(X), b(B) {}
Sum( Sum&, split ) {ret = 0;}
void operator()( const blocked_range<int>& r ) {
float temp = ret;
for( int i = r.begin(); i != r.end(); i++ ) {
float sum = 0.0f;
for(int j = 0; j < SIZE; j++){
sum += a[i][j] * x[j];
temp += pow(b[i] - sum, 2);
ret = temp;
void join( Sum& rhs ) {ret += rhs.ret;}
// || b - Ax ||
// Reduces the squared residual over rows 0..SIZE-1 with parallel_reduce,
// prints the total and its square root, and returns 1 when the square root is
// at most RESIDUO, otherwise 0.
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE])
{
    Sum accumulator(a, x, b);
    parallel_reduce(blocked_range<int>(0, SIZE), accumulator);

    float residual_norm = sqrt(accumulator.ret);
    printf("Ret: %f | Residuo: %f\n", accumulator.ret, residual_norm);

    if (residual_norm <= RESIDUO) {
        return 1;
    }
    return 0;
}
// ----------------------------------------------------------------
// Returns rand() scaled by RAND_MAX, i.e. a float in the closed range [0, 1].
float randomFloat()
{
    return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}
// ----------------------------------------------------------------
// Checks that the SIZE x SIZE matrix `a` is diagonally dominant by rows:
// |a[i][i]| must be at least the sum of |a[i][j]| over j != i.
// Absolute values are used so that negative entries cannot cancel each other
// out of the sum; for the non-negative matrices built by generate_ddm the
// result is unchanged.
// On the first offending row it prints the row index, the off-diagonal sum
// and the row itself (diagonal entry in parentheses) and returns 0.
// Returns 1 when every row passes.
int check_ddm(float (*a)[SIZE]){
    for(int i = 0; i < SIZE; i++){
        float sum = 0.0f;
        for(int j = 0; j < SIZE; j++){
            if(i != j){
                sum += fabsf(a[i][j]);
            }
        }
        if(fabsf(a[i][i]) < sum){
            printf("line: %d, sum: %f, a[i][i]: %f \n", i, sum, a[i][i]);
            for(int j = 0; j < SIZE; j++){
                if(i != j) printf("%f ", a[i][j]);
                else printf("(%f) ", a[i][j]);
            }
            return 0;
        }
    }
    return 1;
}
// ----------------------------------------------------------------
// Fills `a` with random off-diagonal entries from randomFloat() and SIZE on
// the diagonal, sets b[i] to the sum of row i's off-diagonal entries plus
// SIZE, and returns the result of check_ddm(a).
int generate_ddm(float (*a)[SIZE], float *b)
{
    for(int row = 0; row < SIZE; ++row){
        float off_diagonal_total = 0.0f;
        for(int col = 0; col < SIZE; ++col){
            if(row == col){
                continue;
            }
            a[row][col] = randomFloat();
            off_diagonal_total += a[row][col];
        }
        a[row][row] = SIZE;
        b[row] = off_diagonal_total + SIZE;
    }
    return check_ddm(a);
}
// ----------------------------------------------------------------
// Jacobi driver: allocates two solution buffers x[0]/x[1], the matrix a and
// the vector b, generates a diagonally dominant system, then alternates the
// read/write buffers and recomputes x[write] from x[read] until converge()
// reports success. Finally it prints the elapsed time and the first and last
// solution entries.
// NOTE(review): closing braces are missing from this listing, and the call
// that should receive the lambda below (presumably `parallel_for(...)`) is
// not visible.
int main( )
float (*x)[SIZE] = (float(*)[SIZE])malloc(sizeof *x * 2);
float (*a)[SIZE] = (float(*)[SIZE])malloc(sizeof *a * SIZE);
float (*b) = (float*)malloc(sizeof(float) * SIZE);
int i = 0, j = 0;
float delta = 0.0f;
// Indices of the buffer being read and the buffer being written; swapped on
// every iteration.
int read = 0;
int write = 1;
tbb::task_scheduler_init init(THREADS);
// set up initial solution
for(i = 0; i < SIZE; i++){
x[0][i] = i;
x[1][i] = i;
// generate a diagonal dominant matrix
if(!generate_ddm(a, b)){
printf("Array generated is not ddm!\n");
return 1;
tick_count startTime = tick_count::now();
while(!converge(a, x[write], b)){
read = !read;
write = !write;
// Per-range update: x[write][i] = (b[i] - sum over j != i of
// a[i][j] * x[read][j]) / a[i][i].
[&] (const blocked_range<int>& r) {
for (int i = r.begin(); i < r.end(); i++) {
float delta = 0.0f;
for(int j = 0; j < SIZE; j++){
if(j != i){
delta += a[i][j] * x[read][j];
x[write][i] = (b[i] - delta) / a[i][i];
tick_count lastTime = tick_count::now();
float walltime = (lastTime - startTime).seconds();
printf("tbb %f\n", walltime);
converge(a, x[write], b);
printf("x0: %f | x%d: %f\n", x[write][0], SIZE-1, x[write][SIZE-1]);
// NOTE(review): x, a and b are not freed in the visible code.
return 0;
The segfault occurs on the following line inside the Sum class:
sum += a[i][j] * x[j];
And if I change that line to
float tmpa = a[i][j];
float tmpx = x[j];
sum += tmpa * tmpx;
The error continues to be on
sum += tmpa * tmpx;
In the original version, the "splitting constructor" left a, x, and b undefined. They need to be copied from the incoming Sum& argument. E.g., change the splitting constructor to:
Sum( Sum& s, split ) {a=s.a; b=s.b; x=s.x; ret = 0;}
Changing the class to a lambda expression solved the problem. It may be a bug in TBB's parallel_reduce.
// Lambda-based variant of converge: the first lambda adds the squared
// residual of each row in its range to the running value `init`, the second
// lambda combines two partial sums. Prints the total and its square root and
// returns whether the square root is at most RESIDUO.
// NOTE(review): closing braces and the closing `);` of the call are missing
// from this listing. The identity value that the functional form of
// parallel_reduce expects between the range and the first lambda (presumably
// `0.0f`) is not visible either — confirm.
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){
float val = 0.0f;
val = parallel_reduce(
blocked_range<int>(0, SIZE),
[&]( const blocked_range<int>& r, float init )->float {
float temp = init;
for(int i = r.begin(); i != r.end(); i++ ) {
float sum = 0.0f;
for(int j = 0; j < SIZE; j++){
sum += a[i][j] * x[j];
temp += pow(b[i] - sum, 2);
return temp;
[]( float x, float y)->float{
return x+y;
float norm = sqrt(val);
printf("Ret: %f | Residuo: %f\n", val, norm);
return (norm <= RESIDUO);