I've been trying to create an OpenMP variant of a Julia set renderer, but I'm unable to produce a coherent image when running with more than one thread. It looks like a race condition, but I cannot find the error.
The offending output looks like the expected image, but with "scanlines" across the entire picture.
I've attached the code in case the description is not clear enough.
#include <iostream>
#include <math.h>
#include <fstream>
#include <sstream>
#include <omp.h>
#include <QtWidgets>
#include <QElapsedTimer>
using namespace std;
/// Maps pixel column `x` of an image `imageWidth` pixels wide to the real
/// coordinate of the sampled point: the centre column maps to 0 and the
/// image spans roughly [-1.5, 1.5].
double newReal(int x, int imageWidth){
    const int columnFromCentre = x - imageWidth / 2;   // integer half-width, as in the original
    const double halfWidth = 0.5 * imageWidth;
    const double scaledOffset = 1.5 * columnFromCentre;
    return scaledOffset / halfWidth;
}
/// Maps pixel row `y` of an image `imageHeight` pixels tall to the imaginary
/// coordinate of the sampled point: the centre row maps to 0 and the image
/// spans roughly [-1, 1].
double newImaginary(int y, int imageHeight){
    const int rowFromCentre = y - imageHeight / 2;   // integer half-height, as in the original
    const double halfHeight = 0.5 * imageHeight;
    return rowFromCentre / halfHeight;
}
/// Iterates z -> z*z + c starting from z = (newReal, newImaginary).
///
/// All four coordinate arguments are in/out: on return `newReal`/`newImaginary`
/// hold the last computed z and `oldReal`/`oldImaginary` the value before it.
///
/// @param cRe,cIm        real and imaginary part of the constant c
/// @param maxIterations  upper bound on the number of iterations
/// @return index of the iteration at which |z|^2 first exceeded 4, or
///         `maxIterations` if that never happened.
int julia(double& newReal, double& newImaginary, double& oldReal, double& oldImaginary, double cRe, double cIm,int maxIterations){
    int iteration = 0;
    while (iteration < maxIterations) {
        // Remember the previous point, then advance one step.
        oldReal = newReal;
        oldImaginary = newImaginary;
        newReal = oldReal * oldReal - oldImaginary * oldImaginary + cRe;
        newImaginary = 2 * oldReal * oldImaginary + cIm;

        const bool escaped = (newReal * newReal + newImaginary * newImaginary) > 4;
        if (escaped) {
            return iteration;
        }
        ++iteration;
    }
    return iteration;
}
int main(int argc, char *argv[])
{
int fnum=atoi(argv[1]);
int numThr=atoi(argv[2]);
// int imageHeight=atoi(argv[3]);
// int imageWidth=atoi(arg[4]);
// int maxIterations=atoi(argv[5]);
// double cRe=atof(argv[3]);
// double cIm=atof(argv[4]);
//double cRe, cIm;
int imageWidth=10000, imageHeight=10000, maxIterations=3000;
double newRe, newIm, oldRe, oldIm,cRe,cIm;
cRe = -0.7;
cIm = 0.27015;
string fname;
QElapsedTimer time;
QImage img(imageHeight, imageWidth, QImage::Format_RGB888);//Qimagetesting
img.fill(QColor(Qt::black).rgb());//Qimagetesting
time.start();
int i,x,y;
int r, gr, b;
#pragma omp parallel for shared(imageHeight,imageWidth,newRe,newIm) private(x,y,i) num_threads(3)
for(y = 0; y < imageHeight; y++)
{
for(x = 0; x < imageWidth; x++)
{
newRe = newReal(x,imageWidth);
newIm = newImaginary(y,imageHeight);
i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
r = (3*i % 256);
gr = (2*(int)sqrt(i) % 256);
b = (i % 256);
img.setPixel(x, y, qRgb(r, gr, b));
}
}
//stringstream s;
//s << fnum;
//fname= "julia" + s.str();
//fname+=".png";
//img.save(fname.c_str(),"PNG", 100);
img.save("julia.png","PNG", 100);
cout<< "Finished"<<endl;
cout<<time.elapsed()/1000.00<<" seconds"<<endl;
}
As pointed out in the comments, you have three main problems:
newRe and newIm are shared, but should not be (the same goes for oldRe and oldIm, which julia() writes through references)
r, gr and b have no data-sharing clause; since they are declared outside the parallel region they are shared by default
There are concurrent calls to QImage::setPixel
To correct this, do not hesitate to nest an omp for loop inside an omp parallel block.
Declare the private variables inside the parallel block, just before the for loop:
Since QImage::setPixel is not thread-safe, you can prevent concurrent calls to it by putting the call in a critical region with #pragma omp critical.
/**
 * Corrected OpenMP Julia renderer: every per-pixel variable is declared inside
 * the parallel block (and is therefore private to each thread), and the call
 * to QImage::setPixel is serialised with a critical section.
 *
 * Writes the result to "julia.png" and prints the elapsed time in seconds.
 */
int main(int argc, char *argv[])
{
    int imageWidth=1000, imageHeight=1000, maxIterations=3000;
    double cRe = -0.7;
    double cIm = 0.27015;
    QElapsedTimer time;
    // QImage takes (width, height, format); the two were swapped before,
    // which only went unnoticed because the image is square.
    QImage img(imageWidth, imageHeight, QImage::Format_RGB888);
    img.fill(Qt::black);
    time.start();
    #pragma omp parallel
    {
        /* all following values are declared inside the parallel block and are private to each thread */
        int i,x,y;
        int r, gr, b;
        double newRe, newIm, oldRe, oldIm;
        #pragma omp for
        for(y = 0; y < imageHeight; y++)
        {
            for(x = 0; x < imageWidth; x++)
            {
                newRe = newReal(x,imageWidth);
                newIm = newImaginary(y,imageHeight);
                i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
                r = (3*i % 256);
                gr = (2*(int)sqrtf(i) % 256);
                b = (i % 256);
                // Only one thread at a time may touch the image object.
                #pragma omp critical
                img.setPixel(x, y, qRgb(r, gr, b));
            }
        }
    }
    img.save("julia.png","PNG", 100);
    cout<<time.elapsed()/1000.00<<" seconds"<<endl;
    return 0;
}
To go further, you can save some cpu time replacing ::setPixel by ::scanLine:
#pragma omp for
for(y = 0; y < imageHeight; y++)
{
uchar *line = img.scanLine(y);
for(x = 0; x < imageWidth; x++)
{
newRe = newReal(x,imageWidth);
newIm = newImaginary(y,imageHeight);
i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
r = (3*i % 256);
gr = (2*(int)sqrtf(i) % 256);
b = (i % 256);
*line++ = r;
*line++ = gr;
*line++ = b;
}
}
EDIT:
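Since the Julia set has a central symmetry around the point (0,0), you only need to perform half of the computation and mirror each result (note that with an odd image height the middle row is not covered by this loop and must be computed separately):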
int half_heigt = imageHeight / 2;
#pragma omp for
// compute only for first half of image
for(y = 0; y < half_heigt; y++)
{
for(x = 0; x < imageWidth; x++)
{
newRe = newReal(x,imageWidth);
newIm = newImaginary(y,imageHeight);
i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
r = (3*i % 256);
gr = (2*(int)sqrtf(i) % 256);
b = (i % 256);
#pragma omp critical
{
// set the point
img.setPixel(x, y, qRgb(r, gr, b));
// set the symetric point
img.setPixel(imageWidth-1-x, imageHeight-1-y, qRgb(r, gr, b));
}
}
}
Related
I am trying to accelerate the calculation of the center of mass in different ROI areas. Here is my original code:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv/cv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "omp.h"
#include <time.h>
#include <ctime>
#define File_SubAperture "SubAperture.txt"
#define Row_Subapaerture 750
Mat src;
Mat src_gray;
using namespace cv;
using namespace std;
int thresh = 0;
int max_thresh = 255;
Mat ROI;
int nn = 0;
float subApX[751] = { 0 };
float subApY[751] = { 0 };
float x = 0.0;
float y = 0.0;
float k = 0.0;
float f = 0.0;
// Reads sub-aperture rectangles from File_SubAperture, loads a grayscale
// image, and for each of the first 749 rectangles computes an
// intensity-weighted centre inside a 4x4 ROI, writing the coordinates to two
// text files and printing the elapsed CPU time.
int main()
{
clock_t startTime, endTime;
// NOTE(review): Col_Subaperture is not defined in the visible part of this file.
int SubAperture[Row_Subapaerture][Col_Subaperture];
double data1, data2, data3, data4;
int i;
Rect rect_subaperture[Row_Subapaerture];
FILE * fp_SubAperture;
// NOTE(review): neither output file is checked for NULL, and neither is
// closed anywhere in this function.
FILE* px;
px = fopen("C:\\Users\\DELL\\Desktop\\AO acceleration\\center-coordinates-x.txt", "w+");
FILE* py;
py=fopen("C:\\Users\\DELL\\Desktop\\AO acceleration\\center-coordinates-y.txt", "w+");
// --- read file SubAperture.txt ---
fp_SubAperture = fopen(File_SubAperture, "r");
if (fp_SubAperture == NULL)
{
perror("Couldn't open the file " File_SubAperture);
exit(1);
}
// Four numbers per record. The loop ends only when fscanf returns EOF; the
// row index i is not checked against Row_Subapaerture.
for (i = 0; fscanf(fp_SubAperture, "%lf%lf%lf%lf", &data1, &data2, &data3, &data4) != EOF; ++i)
{
SubAperture[i][0] = (int)data1;
SubAperture[i][1] = (int)data2;
SubAperture[i][2] = (int)data3;
SubAperture[i][3] = (int)data4;
}
fclose(fp_SubAperture);
// read image
float sumval = 0.0;
MatIterator_<uchar> it, end;
src = imread("WFS_29x29-circle.png", CV_LOAD_IMAGE_COLOR);
cvtColor(src, src_gray, CV_BGR2GRAY);
//imshow("gray image", src_gray);
// calculate the ROI rectangles in advance (fixed 4x4 size; the size derived
// from the file is commented out)
for (i = 0; i < 749; i++)
{
rect_subaperture[i].x = SubAperture[i][0];
rect_subaperture[i].y = SubAperture[i][1];
//rect_subaperture[i].width = SubAperture[i][2] - SubAperture[i][0];
//rect_subaperture[i].height = SubAperture[i][3] - SubAperture[i][1];
rect_subaperture[i].width = 4;
rect_subaperture[i].height = 4;
}
startTime = clock();// time start
omp_set_num_threads(2);
// NOTE(review): `end` is declared outside the region and does not appear in
// any data-sharing clause, so it is presumably shared between the threads
// while each of them assigns it in the iterator loop below — confirm.
// NOTE(review): collapse(2) asks for two nested loops to be merged, but the
// body of the i-loop starts with other statements before any inner loop —
// verify that this is accepted and behaves as intended.
// NOTE(review): sumval, x and y are firstprivate and are never reset inside
// the loop body, so they keep accumulating over all iterations a thread runs.
#pragma omp parallel private(ROI,it,i,k,f ) firstprivate(sumval,x,y) shared(src_gray,subApX,subApY,rect_subaperture)
#pragma omp for nowait schedule(guided) collapse(2)
for(i=0; i<749;i++)
{
ROI = src_gray(rect_subaperture[i]);
// Sum of all ROI pixels above the threshold.
for (it = ROI.begin<uchar>(), end = ROI.end<uchar>(); it != end; it++)
{
((*it) > thresh) ? sumval += (*it) : NULL;
// printf("sum = %f\n", sumval);
}
// These loop variables k and f are new ints that hide the file-scope floats
// of the same name.
for (int k = 0; k < ROI.cols; k++)
{
for (int f = 0; f < ROI.rows; f++)
{
float S = ROI.at<uchar>(f, k);
if (S < thresh)
S = 0;
x += (k * S) / sumval;
y += (f * S) / sumval;
}
}
// Convert the ROI-relative result to image coordinates.
subApX[i]= x + SubAperture[i][0];
subApY[i]= y + SubAperture[i][1];
// NOTE(review): both threads write to the same FILE streams, so the order of
// the lines in the output files is presumably not the order of i — confirm.
fprintf(px, "\n%f", subApX[i]);
fprintf(py, "\n%f", subApY[i]);
}
endTime = clock();
printf("time = %f\n", (double)(endTime - startTime) / CLOCKS_PER_SEC);
return 0;
}
As you can see, I need several nested for loops to finish the calculation. However, the program gets stuck without reporting any error: only about 50% of the calculation is finished, and the rest is never computed.
Does anyone know what the problem is, and how I can speed up my code? The goal of this code is to calculate the center of mass in different ROI areas of one image. The coordinates are saved as .txt files and the program reports the time needed.
I am trying to extract patches from an image in parallel, with a pixel shift so that the patches overlap. I have written the CPU version of the code, but I was not able to convert the for loop whose increment is the pixel shift. Below is the part of the code where that loop is used: the CreatePatchDataSet function contains the for loop that advances by the pixel shift. Please help me convert this function to CUDA. The code is as follows.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>
using namespace std;
using namespace cv;
#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)
// Copies `input` row by row into the flat array `output`.
// For pixel (row, col) the value read through a double row pointer at index
// `col` is stored at output[cols * channels * row + channels * col].
// NOTE(review): rows are read as double regardless of the matrix type —
// presumably the caller is expected to pass a CV_64F matrix; confirm.
void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
    for (int row = 0; row < input.rows; ++row)
    {
        const double *rowValues = input.ptr<double>(row);
        for (int col = 0; col < input.cols; ++col)
        {
            const int rowOffset = input.cols * input.channels() * row;
            const int colOffset = input.channels() * col;
            output[rowOffset + colOffset + 0] = rowValues[col];
        }
    }
}
// Counts how many patches are needed to cover `extent` pixels along one axis:
// the first patch covers `patch_size` pixels and every further patch advances
// the covered length by `pixel_shift`.
static int CountPatchesAlongAxis(const int extent, const int patch_size, const int pixel_shift) {
    int patches = 0;
    int covered = 0;
    while (covered < extent) {
        // patch_size - (patch_size - pixel_shift) is simply pixel_shift
        covered += (covered == 0) ? patch_size : pixel_shift;
        ++patches;
    }
    return patches;
}

// Computes the number of patches per row (`*num_of_patch_col`), per column
// (`*num_of_patch_row`) and in total (`*num_of_patch`) for an image of
// `width` x `height` pixels.
void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
    *num_of_patch_col = CountPatchesAlongAxis(width, patch_size, pixel_shift);
    *num_of_patch_row = CountPatchesAlongAxis(height, patch_size, pixel_shift);
    *num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}
// Extracts overlapping patch_size x patch_size patches from `original_data`
// (row-major, `width` x `height`), stepping `pixel_shift` pixels between
// patches in both directions.
//
// Layout of `patch_data`: element (ii, jj) of patch number p is stored at
// patch_data[num_of_patch_image * (patch_size * ii + jj) + p], where
// p = num_of_patch_col * patch_row + patch_col. Patch elements that fall
// outside the image are written as 0.
void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    const int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    int patch_row = 0;
    for (int top = 0; top < height; top += pixel_shift) {
        int patch_col = 0;
        for (int left = 0; left < width; left += pixel_shift) {
            const int patch_index = num_of_patch_col * patch_row + patch_col;
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    const int src_row = top + ii;
                    const int src_col = left + jj;
                    const bool inside = src_row < height && src_col < width;
                    double *dst = &patch_data[num_of_patch_image * (patch_size * ii + jj) + patch_index];
                    *dst = inside ? original_data[width * src_row + src_col] : 0.;
                }
            }
            // Stop once the expected number of patches in this row is reached.
            if (++patch_col == num_of_patch_col) {
                break;
            }
        }
        // Stop once the expected number of patch rows is reached.
        if (++patch_row == num_of_patch_row) {
            break;
        }
    }
}
// CPU driver: loads an image, upscales it by `ratio`, converts it to a flat
// double array and extracts overlapping patches into FY.
int main()
{
int ratio=2;
// NOTE(review): the result of imread is not checked for an empty image.
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0,
cv::INTER_LANCZOS4);
// Flat buffer with one double per pixel and channel of the upscaled image.
double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
// NOTE(review): ConvertMat2DoubleArray reads each row through a double
// pointer — confirm that imageH actually holds 64-bit floating point data.
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
// Number of values in one high-resolution patch.
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
// dimL is computed but not used in this function.
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
// FY holds dimH values for each patch; calloc zero-initialises it.
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
free(orgimageH);
free(FY);
return 0;
}
The results I got for first 10 values in CPU version:
patch numbers:
16129
238,240,240,235,237,230,227,229,228,227
I have tried to convert this function into a CUDA kernel, but it goes into an infinite loop. As I am very new to CUDA, could you please help me find the problem in the code?
// CUDA version of CreatePatchDataSet.
//
// Each thread handles whole patches: the x dimension of the launch walks over
// patch columns and the y dimension over patch rows. Both loops advance by the
// total number of threads in that dimension, so any launch configuration
// covers all num_of_patch_row x num_of_patch_col patches.
//
// Layout of `patch_data` (same as the CPU version): element (ii, jj) of patch
// p is stored at patch_data[num_of_patch_image * (patch_size * ii + jj) + p]
// with p = num_of_patch_col * counter_row + counter_col. Elements outside the
// image are written as 0, so every entry of `patch_data` is written even
// though the device buffer is not zero-initialised.
//
// The previous version never advanced i/j inside its while loop (infinite
// loop) and reset counter_row/counter_col to 0 for every thread, so all
// threads wrote to patch 0.
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    const int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    for (int counter_row = threadIdx.y + blockDim.y * blockIdx.y;
         counter_row < num_of_patch_row;
         counter_row += blockDim.y * gridDim.y) {
        const int i = counter_row * pixel_shift;   // top row of this patch
        for (int counter_col = threadIdx.x + blockDim.x * blockIdx.x;
             counter_col < num_of_patch_col;
             counter_col += blockDim.x * gridDim.x) {
            const int j = counter_col * pixel_shift;   // left column of this patch
            const int patch_index = num_of_patch_col * counter_row + counter_col;
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    const int dst = num_of_patch_image * (patch_size * ii + jj) + patch_index;
                    if ((i + ii) < height && (j + jj) < width) {
                        patch_data[dst] = original_data[width * (i + ii) + (j + jj)];
                    }
                    else {
                        patch_data[dst] = 0.;
                    }
                }
            }
        }
    }
}
// CUDA driver: prepares the upscaled image on the host, copies it to the
// device, launches CreatePatchDataSet and copies the patches back into FY.
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
// Number of values in one high-resolution patch; dimL is computed but unused here.
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
// NOTE(review): gpuErrchk is not defined in the visible part of this file.
// The device image buffer is sized widthH*heightH doubles (no channel
// factor, unlike the host buffer above).
double *d_orgimageH;
gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
// Device patch buffer; it is allocated with cudaMalloc and not cleared here.
double *d_FY;
gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH * num_of_patch_image));
gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH , sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
// 16x16 threads per block; the grid is sized from the image dimensions
// (rounded up), not from the number of patches.
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
// NOTE(review): no error check follows the kernel launch itself.
CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
// cout<<"Hello world";
free(orgimageH);
free(FY);
cudaFree(d_FY);
cudaFree(d_orgimageH);
return 0;
}
Image I have used: [1]: https://i.stack.imgur.com/Ywg7p.png
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
are outside the while loop in your kernel. Since i and j never change inside the while loop, its condition never becomes false and the loop never terminates. There could be more problems here, but this is the most prominent one.
EDIT: Another problem I found is that you have a single while loop over both i and j instead of one loop for each. You should probably use for loops as in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
i < height;
i += pixel_shift * blockDim.x * gridDim.x) {
for (j = ...; j < ...; j += ...) {
/* ... */
}
}
EDIT 2:
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
counter_row < num_of_patch_row;
counter_row += blockDim.y * gridDim.y) {
i = counter_row * pixel_shift;
if (i > height)
break;
for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
counter_col < num_of_patch_col;
counter_col += blockDim.x * gridDim.x) {
j = counter_col * pixel_shift;
if (j > width)
break;
/* ... */
}
}
I have also exchanged the x/y fields of the execution parameters between the inner and the outer loop, as it seemed more appropriate considering that the x field is continuous in warps (memory access benefits).
I am on MSVC 2019 with the default compiler. The code I am working on is a Mandelbrot image. Relevant bits of my code looks like:
#pragma omp parallel for
for (int y = 0; y < HEIGHT; y++)
{
for (int x = 0; x < WIDTH; x++)
{
unsigned int retVal = mandel(x_val + x_incr * x, y_val + y_incr * y);
mtest.setPixels(x, y,
static_cast<unsigned char>(retVal / 6),
static_cast<unsigned char>(retVal / 5),
static_cast<unsigned char>(retVal / 4));
}
}
All of the variables outside of the loop are constexpr, eliminating any dependencies. The mandel function does about 1000 iterations with each call. I would expect the outer loop to run on several threads but my msvc records each run at about 5-6 seconds with or without the omp directive.
Edit (The mandel function):
// Iterates z -> z*z + c for c = (x, y), starting from z = 0.
// Returns the index of the first iteration after which |z|^2 exceeds 4, or
// ITER if that does not happen within ITER iterations.
unsigned int mandel(long double x, long double y)
{
    long double re = 0;
    long double im = 0;
    int step = 0;
    while (step < ITER)
    {
        const long double previous_re = re;
        re = (re * re) - (im * im) + x;
        im = 2 * previous_re * im + y;
        const bool escaped = (re * re + im * im) > 4;
        if (escaped)
            return step;
        ++step;
    }
    return ITER; //ITER is a #define macro
}
Your mandel function has a vastly differing runtime cost depending on whether the if condition within the loop has been met. As a result, each iteration of your loop will run in a different time. By default omp uses static scheduling (i.e. break loop into N partitions). This is kinda bad, because you don't have a workload that fits static scheduling. See what happens when you use dynamic scheduling.
#pragma omp parallel for schedule(dynamic, 1)
for (int y = 0; y < HEIGHT; y++)
{
for (int x = 0; x < WIDTH; x++)
{
unsigned int retVal = mandel(x_val + x_incr * x, y_val + y_incr * y);
mtest.setPixels(x, y,
static_cast<unsigned char>(retVal / 6),
static_cast<unsigned char>(retVal / 5),
static_cast<unsigned char>(retVal / 4));
}
}
Also time to rule out the really dumb stuff.....
Have you included omp.h at least once in your program?
Have you enabled omp in the project settings?
IIRC, if you haven't done those two things, omp will be disabled under MSVC.
This is not an answer, but please do this:
// Iterates z -> z*z + c for c = (x, y), starting from z = 0, and caches the
// squares of both components so that each is computed once per iteration
// (they are needed for the next z as well as for the escape test).
// Returns the index of the first iteration after which |z|^2 exceeds 4, or
// ITER if that does not happen within ITER iterations.
unsigned int mandel(long double x, long double y)
{
    long double z_x = 0;
    long double z_y = 0;
    long double z_x_squared = 0;
    long double z_y_squared = 0;
    for (int i = 0; i < ITER; i++)
    {
        long double temp = z_x;
        z_x = z_x_squared - z_y_squared + x;
        z_y = 2 * temp * z_y + y;
        z_x_squared = z_x * z_x;
        z_y_squared = z_y * z_y; // was "z_y * z_u", an undeclared name
        if ((z_x_squared + z_y_squared) > 4)
            return i;
    }
    return ITER; //ITER is a #define macro
}
Also, try inverting the order of your two for loops.
I've modified a raytracer I wrote a while ago for educational purposes to take advantage of multiprocessing using OpenMP. However, I'm not seeing any profit from the parallelization.
I've tried 3 different approaches: a task-pooled environment (the draw_pooled() function), a standard OMP parallel nested for loop with image row-level parallelism (draw_parallel_for()), and another OMP parallel for with pixel-level parallelism (draw_parallel_for2()). The original, serial drawing routine is also included for reference (draw_serial()).
I'm running a 2560x1920 render on an Intel Core 2 Duo E6750 (2 cores @ 2.67 GHz each w/ Hyper-Threading) with 4 GB of RAM under Linux; the binary is compiled by gcc with libgomp. The scene takes an average of:
120 seconds to render in series,
but 196 seconds (sic!) to do so in parallel in 2 threads (the default - number of CPU cores), regardless of which of the three particular methods above I choose,
if I override OMP's default thread number with 4 to take HT into account, the parallel render times drop to 177 seconds.
Why is this happening? I can't see any obvious bottlenecks in the parallel code.
EDIT: Just to clarify - the task pool is only one of the implementations, please do read the question - scroll down to see the parallel fors. Thing is, they are just as slow as the task pool!
// Renders a w x h image into an RGB buffer (3 bytes per pixel) and writes it
// to `fname`, sharing the pixels of each row among 4 OpenMP threads.
//
// The thread team is created once, outside the row loop; previously a new
// parallel region was opened for every row. Every thread runs the row loop
// and the inner `omp for` divides the pixels of the current row among them.
void draw_parallel_for(int w, int h, const char *fname) {
    // size_t arithmetic so that w * h * 3 cannot overflow int
    const size_t pixel_count = static_cast<size_t>(w) * static_cast<size_t>(h);
    unsigned char *buf = new unsigned char[pixel_count * 3];
    Scene::GetInstance().PrepareRender(w, h);
    #pragma omp parallel num_threads(4)
    for (int y = 0; y < h; ++y) {
        #pragma omp for
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (static_cast<size_t>(y) * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
// Renders a w x h image into an RGB buffer (3 bytes per pixel) and writes it
// to `fname`. A single flattened loop over all pixels is divided among 4
// OpenMP threads.
void draw_parallel_for2(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    #pragma omp parallel for num_threads(4)
    for (int xy = 0; xy < w * h; ++xy) {
        // Declared inside the loop body, so each thread has its own copy.
        const int col = xy % w;
        const int row = xy / w;
        unsigned char *target = pixels + (row * w + col) * 3;
        Scene::GetInstance().RenderPixel(col, row, target);
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Renders a w x h image into an RGB buffer (3 bytes per pixel) and writes it
// to `fname`. Whole rows are divided among 4 OpenMP threads.
void draw_parallel_for3(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    #pragma omp parallel for num_threads(4)
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            unsigned char *target = pixels + (row * w + col) * 3;
            Scene::GetInstance().RenderPixel(col, row, target);
        }
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Reference implementation: renders a w x h image pixel by pixel on the
// calling thread into an RGB buffer (3 bytes per pixel) and writes it to
// `fname`.
void draw_serial(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            unsigned char *target = pixels + (row * w + col) * 3;
            Scene::GetInstance().RenderPixel(col, row, target);
        }
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
std::queue< std::pair<int, int> * > task_queue;
// Renders a w x h image using a pool of 4 OpenMP threads that pull pixel
// coordinates from the global `task_queue`, then writes the result to `fname`.
//
// The master thread fills the queue first. `omp master` has no implied
// barrier, so an explicit barrier keeps the other threads from popping while
// the master is still pushing (the queue was previously modified without
// synchronisation by the producer). After the barrier the queue is only ever
// accessed inside the `task_fetch` critical section, and an empty queue means
// all work has been handed out.
void draw_pooled(int w, int h, const char *fname) {
    unsigned char *buf;
    buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    #pragma omp parallel shared(buf, w, h) num_threads(4)
    {
        #pragma omp master
        {
            for (int y = 0; y < h; ++y) {
                for (int x = 0; x < w; ++x)
                    task_queue.push(new std::pair<int, int>(x, y));
            }
        }
        // Wait until every task has been queued.
        #pragma omp barrier
        while (true) {
            std::pair<int, int> *coords = NULL;
            #pragma omp critical(task_fetch)
            {
                if (!task_queue.empty()) {
                    coords = task_queue.front();
                    task_queue.pop();
                }
            }
            if (coords == NULL)
                break; // queue drained: nothing left for this thread
            Scene::GetInstance().RenderPixel(coords->first, coords->second,
                buf + (coords->second * w + coords->first) * 3);
            delete coords;
        }
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
You have a critical section inside your innermost loop. In other words, you're hitting a synchronization primitive per pixel. That's going to kill performance.
Better split the scene in tiles and work one on each thread. That way, you have a longer time (a whole tile's worth of processing) between synchronizations.
If the pixels are independent you don't actually need any locking. You can just divide up the image into rows or columns and let the threads work on their own. For example, you could have each thread operate on every nth row (pseudocode):
for(int y = THREAD_NUM; y < h; y += THREAD_COUNT)
for(int x = 0; x < w; ++x)
render_pixel(x,y);
Where THREAD_NUM is a unique number for each thread such that 0 <= THREAD_NUM < THREAD_COUNT. Then after you join your threadpool, perform the png conversion.
There is always a performance overhead when creating threads. Opening an OpenMP parallel region inside a for loop will therefore generate a lot of overhead, because the region is entered once per iteration. For example, in your code
// Original form (buffer setup and output elided): the parallel region is
// inside the row loop.
void draw_parallel_for(int w, int h, const char *fname) {
for (int y = 0; y < h; ++y) {
// A parallel region is entered here once for every row, i.e. h times in
// total; this is where the overhead comes from.
#pragma omp parallel for num_threads(4)
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
It can be re-written as
// First rewrite (buffer setup and output elided): a single parallel region
// around the row loop, so rows rather than pixels are divided among the
// 4 threads.
void draw_parallel_for(int w, int h, const char *fname) {
#pragma omp parallel for num_threads(4)
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
or
// Second rewrite (buffer setup and output elided): the parallel region is
// opened once; every thread executes the row loop, and the inner `omp for`
// divides the pixels of each row among the threads.
void draw_parallel_for(int w, int h, const char *fname) {
#pragma omp parallel num_threads(4)
for (int y = 0; y < h; ++y) {
// NOTE(review): no nowait clause is given, so the threads presumably
// synchronise at the end of every row — confirm whether that is acceptable.
#pragma omp for
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
By this way, you will eliminate the overhead
I am getting the error "invalid lvalue in assignment".
This is the only error in my program; it is a compile-time error in my pthread code.
I am now trying to read the inputs at run time from command-line arguments, and that is when the error appeared. Previously, when the inputs were initialized statically in the program itself, I did not get any error.
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <time.h>
#include <sched.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <stdint.h>
#define num_threads 8
pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
unsigned int width = 1500;
unsigned int height = 1500;
unsigned int max_iterations = 30000;
unsigned int **color = NULL;
double min_re;
double max_re;
double min_im;
double max_im;
double x_factor;
double y_factor;
unsigned int NUM_OF_THREADS;
int chunk = 10;
int total_sum = 0;
/*
 * Writes the global `color` array as a binary PPM (P6) image to
 * "mandelbrot_imagepthread.ppm".
 *
 * Returns true if the file was opened, written and closed without error,
 * false otherwise (previously true was returned even when the file could not
 * be opened).
 *
 * NOTE(review): the loop order (x outer, y inner) and the "height width"
 * order in the header are kept exactly as before; together with method1()
 * storing results as color[x][y] they appear to compensate for each other —
 * confirm before changing any one of them.
 */
bool file_write()
{
    /* "wb": P6 pixel data is binary and must not go through text-mode translation. */
    FILE *fractal = fopen("mandelbrot_imagepthread.ppm","wb");
    if(fractal == NULL)
    {
        perror("mandelbrot_imagepthread.ppm");
        return false;
    }
    fprintf(fractal,"P6\n");
    fprintf(fractal,"# %s\n", "Mandelbrot_imagepthread.ppm");
    fprintf(fractal,"%u %u\n", height, width); /* both are unsigned */
    fprintf(fractal,"255\n");
    unsigned int y = 0, x = 0;
    unsigned int R = 0, G = 0, B = 0;
    for(x = 0; x < width; ++x)
    {
        for(y = 0; y < height; ++y)
        {
            R = (color[y][x]*10)%255;
            G = 255-((color[y][x]*10)%255);
            B = ((color[y][x]*10)-150)%255;
            /* Byte value 10 is never emitted (kept from the original). */
            if(R == 10) R = 11;
            if(G == 10) G = 11;
            if(B == 10) B = 11;
            putc(R, fractal);
            putc(G, fractal);
            putc(B, fractal);
        }
    }
    const bool write_ok = (ferror(fractal) == 0);
    const bool close_ok = (fclose(fractal) == 0);
    return write_ok && close_ok;
}
/*
 * Escape-time test for the point c = (min_re + x*x_factor, max_im - y*y_factor),
 * iterating Z -> Z*Z + c starting from Z = c.
 *
 * Returns the index of the first iteration at which |Z|^2 > 4. Returns 0 both
 * when that happens at iteration 0 and when it does not happen within
 * max_iterations iterations.
 */
int method(int x, int y, int max_iterations, double max_im,double min_re,double x_factor, double y_factor)
{
    const double c_im = max_im - y*y_factor;
    const double c_re = min_re + x*x_factor;
    double z_re = c_re;
    double z_im = c_im;

    /* The counter is unsigned, so the limit is compared as unsigned too. */
    const unsigned limit = static_cast<unsigned>(max_iterations);
    unsigned n = 0;
    while(n < limit)
    {
        const double re_sq = z_re*z_re;
        const double im_sq = z_im*z_im;
        if(re_sq + im_sq > 4)
        {
            return static_cast<int>(n);
        }
        z_im = 2 * z_re * z_im + c_im;
        z_re = re_sq - im_sq + c_re;
        ++n;
    }
    return 0;
}
/*
 * Hands out the next band of `chunk` rows under the mutex.
 * Returns the first row of the band, or -1 when no rows are left.
 */
static int claim_next_chunk(void)
{
    int first_row = -1;
    pthread_mutex_lock(&mut);
    if(total_sum < height)
    {
        first_row = total_sum;
        total_sum = total_sum + chunk;
    }
    pthread_mutex_unlock(&mut);
    return first_row;
}

/*
 * Thread entry point: repeatedly claims a band of `chunk` rows and computes
 * the escape count for every pixel in it, storing m1*40 in `color` for
 * non-zero counts.
 *
 * `t` is the thread's argument; it is not used for the computation.
 * The thread exits with a pointer to a static double (always 0.0), so a
 * joiner that dereferences the result reads valid memory. Previously the
 * address of the local parameter `t` was returned, which no longer exists
 * once the thread has finished.
 *
 * NOTE(review): the local min_re/max_re/min_im/max_im/x_factor/y_factor below
 * hide the file-scope variables of the same name, so values assigned to those
 * globals elsewhere are not used here — confirm whether that is intended.
 * NOTE(review): results are stored as color[x][y] while the rows of `color`
 * are allocated per `height` in main() — only consistent when width == height;
 * kept unchanged because file_write() reads the array in matching order.
 */
void* method1(void* t)
{
    static double thread_result = 0.0;
    (void)t;
    double min_re = -2.0;
    double max_re = 1.0;
    double min_im = -1.2;
    double max_im = min_im+(max_re-min_re)*height/width;
    double x_factor = (max_re-min_re)/(width-1);
    double y_factor = (max_im-min_im)/(height-1);
    int sub_total = claim_next_chunk();
    while(sub_total > -1)
    {
        int start_point = sub_total;
        int end_point = start_point + chunk;
        /* The last band may be shorter than `chunk`: never run past the image. */
        if((unsigned int)end_point > height)
        {
            end_point = (int)height;
        }
        for(int y=start_point; y<end_point; y++)
        {
            for(unsigned int x=0; x<width; ++x)
            {
                int m1 = method((int)x,y,max_iterations,max_im,min_re,x_factor,y_factor);
                if(m1)
                {
                    color[x][y] = m1*40;
                }
            }
        }
        sub_total = claim_next_chunk();
    }
    pthread_exit((void*)&thread_result);
}
int main(int argc, char *argv[])
{
if(argc != 9)
{
printf("There is an error in the input given.\n");
return 0;
}
else
{
height = atoi(argv[1]);
width = atoi(argv[2]);
max_iterations = atoi(argv[3]);
min_re = atof(argv[4]);
max_re = atof(argv[5]);
min_im = atof(argv[6]);
max_im = atof(argv[7]);
num_threads = atoi(argv[8]);
}
color = (unsigned int**)malloc(height*sizeof(unsigned int*));
x_factor = (max_re-min_re)/(width-1);
y_factor = (max_im-min_im)/(height-1);
printf("height = %d\twidth = %d\tmaximum_iterations = %d\tminimum_x-value = %.2f\tmaximum_x-value = %.2f\tminimum_y-value = %.2f\tmaximum_y-value = %.2f\tno. of threads = %d\t\n",height,width,max_iterations,min_re,max_re,min_im,max_im,num_threads);
int x;
for(x = 0; x < height; x++)
{
color[x] = (unsigned int*)malloc(width*sizeof(unsigned int));
}
time_t ts,te;
time(&ts);
pthread_t t1[num_threads];
pthread_attr_t attr;
int l1;
void *att;
double value = 0.0;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
for(int i=0;i<num_threads;i++)
{
l1 = pthread_create(&t1[i], &attr, method1, (void *) i);
if(l1)
{
printf("There is some kind of error in thread creation: %d", l1);
exit(-1);
}
}
pthread_attr_destroy(&attr);
for(int i=0;i<num_threads;i++)
{
l1 = pthread_join(t1[i],&att);
if(l1)
{
printf("There is some kind of error in thread creation: %d", l1);
exit(-1);
}
double result = *(double *)att;
value += result;
}
time(&te);
double diff = difftime(te,ts);
file_write();
printf("Total Time elapsed: %.2f seconds\n",diff);
for(x = 0; x < height; x++)
{
free(color[x]);
}
free(color);
return 0;
pthread_exit(NULL);
}
The error here is that num_threads is defined as 8 with a #define directive instead of being declared as an int variable.
Change #define num_threads 8 to int num_threads = 8;
In general you should avoid #define for values, because the preprocessor replaces the name textually before the compiler ever sees it, which leads to confusing errors like this one.
If you want a global constant, declare it as static const rather than with a #define. Here the preprocessor substitutes 8 for num_threads, so the assignment num_threads = atoi(argv[8]) turns into the following nonsensical code:
8 = atoi(argv[8])