I am on MSVC 2019 with the default compiler. The code I am working on renders a Mandelbrot image. The relevant bits of my code look like this:
// Render the image row by row. The pragma asks OpenMP to distribute the
// iterations of the outer (y) loop across threads; no schedule clause is given.
#pragma omp parallel for
for (int y = 0; y < HEIGHT; y++)
{
for (int x = 0; x < WIDTH; x++)
{
// Map pixel (x, y) to a point of the plane and ask mandel() for its iteration count.
unsigned int retVal = mandel(x_val + x_incr * x, y_val + y_incr * y);
// Store three byte values derived from the count (count / 6, / 5 and / 4) for this pixel.
mtest.setPixels(x, y,
static_cast<unsigned char>(retVal / 6),
static_cast<unsigned char>(retVal / 5),
static_cast<unsigned char>(retVal / 4));
}
}
All of the variables outside of the loop are constexpr, eliminating any dependencies. The mandel function does about 1000 iterations with each call. I would expect the outer loop to run on several threads but my msvc records each run at about 5-6 seconds with or without the omp directive.
Edit (The mandel function):
/// Escape-time test for the point c = x + i*y.
///
/// Iterates z <- z*z + c starting from z = 0 and returns the index of the
/// first iteration after which |z|^2 exceeds 4. If that never happens within
/// ITER iterations, ITER itself is returned.
unsigned int mandel(long double x, long double y)
{
    long double re = 0;
    long double im = 0;
    int step = 0;
    while (step < ITER)
    {
        // Keep the old real part: the imaginary update must use the value from before this step.
        const long double prev_re = re;
        re = (re * re) - (im * im) + x;
        im = 2 * prev_re * im + y;
        if ((re * re + im * im) > 4)
            return step;
        ++step;
    }
    return ITER; //ITER is a #define macro
}
Your mandel function has a vastly differing runtime cost depending on whether the if condition within the loop has been met. As a result, each iteration of your loop will run in a different time. By default omp uses static scheduling (i.e. break loop into N partitions). This is kinda bad, because you don't have a workload that fits static scheduling. See what happens when you use dynamic scheduling.
#pragma omp parallel for schedule(dynamic, 1)
for (int y = 0; y < HEIGHT; y++)
{
for (int x = 0; x < WIDTH; x++)
{
unsigned int retVal = mandel(x_val + x_incr * x, y_val + y_incr * y);
mtest.setPixels(x, y,
static_cast<unsigned char>(retVal / 6),
static_cast<unsigned char>(retVal / 5),
static_cast<unsigned char>(retVal / 4));
}
}
Also time to rule out the really dumb stuff.....
Have you included omp.h at least once in your program?
Have you enabled omp in the project settings?
IIRC, if you haven't done those two things, omp will be disabled under MSVC.
This is not an answer, but please do this:
/// Escape-time test for the point c = x + i*y, caching the squared components.
///
/// Iterates z <- z*z + c starting from z = 0. z_x_squared / z_y_squared hold
/// the squares of the current z components, so each iteration computes every
/// square once and reuses it both for the escape test and for the next update.
/// Returns the index of the first iteration after which |z|^2 > 4, or ITER if
/// the point never escapes within ITER iterations.
unsigned int mandel(long double x, long double y)
{
    long double z_x = 0;
    long double z_y = 0;
    long double z_x_squared = 0;
    long double z_y_squared = 0;
    for (int i = 0; i < ITER; i++)
    {
        // The imaginary update needs the real part from before this step.
        long double temp = z_x;
        z_x = z_x_squared - z_y_squared + x;
        z_y = 2 * temp * z_y + y;
        z_x_squared = z_x * z_x;
        z_y_squared = z_y * z_y; // was "z_y * z_u": z_u is not declared anywhere
        if ((z_x_squared + z_y_squared) > 4)
            return i;
    }
    return ITER; //ITER is a #define macro
}
Also, try inverting the order of your two for loops.
Related
I've been trying to create an openMP variant of the julia set, but I'm unable to create a coherent image when running more than one thread, I've been trying to solve what looks like a race condition but cannot find the error.
The offending output looks like the required output along with "scanlines" across the entirety of the picture.
I've attached the code as well in case it's not clear enough.
#include <iostream>
#include <math.h>
#include <fstream>
#include <sstream>
#include <omp.h>
#include <QtWidgets>
#include <QElapsedTimer>
using namespace std;
/// Maps a pixel column to a real coordinate.
/// The column's offset from imageWidth / 2 (integer division) is scaled so
/// that column 0 of an even-width image maps to -1.5.
double newReal(int x, int imageWidth){
    const int offsetFromCentre = x - imageWidth / 2;
    const double halfWidth = 0.5 * imageWidth;
    return 1.5 * offsetFromCentre / halfWidth;
}
/// Maps a pixel row to an imaginary coordinate.
/// The row's offset from imageHeight / 2 (integer division) is divided by half
/// the height, so row 0 of an even-height image maps to -1.0.
double newImaginary(int y, int imageHeight){
    const int offsetFromCentre = y - imageHeight / 2;
    const double halfHeight = 0.5 * imageHeight;
    return offsetFromCentre / halfHeight;
}
/// Iterates z <- z*z + (cRe + i*cIm) on the point passed in through
/// newReal/newImaginary and returns how many full iterations ran before
/// |z|^2 exceeded 4 (or maxIterations if it never did).
///
/// All four reference parameters are overwritten: on return newReal/newImaginary
/// hold the last computed z and oldReal/oldImaginary hold the z before it.
/// When maxIterations <= 0 nothing is modified and 0 is returned.
int julia(double& newReal, double& newImaginary, double& oldReal, double& oldImaginary, double cRe, double cIm,int maxIterations){
    int steps = 0;
    while (steps < maxIterations) {
        // Remember the previous point, then advance one iteration.
        oldReal = newReal;
        oldImaginary = newImaginary;
        newReal = oldReal * oldReal - oldImaginary * oldImaginary + cRe;
        newImaginary = 2 * oldReal * oldImaginary + cIm;
        if ((newReal * newReal + newImaginary * newImaginary) > 4) {
            return steps;
        }
        ++steps;
    }
    return steps;
}
// Renders a Julia set for c = -0.7 + 0.27015i into a QImage, saves it as
// "julia.png" and prints the elapsed time. This is the version from the
// question: it produces corrupted output when run with more than one thread.
int main(int argc, char *argv[])
{
// argv[1] and argv[2] are read without checking argc. fnum is only referenced
// by the commented-out file-name code below, and numThr is never used (the
// pragma hard-codes num_threads(3)).
int fnum=atoi(argv[1]);
int numThr=atoi(argv[2]);
// int imageHeight=atoi(argv[3]);
// int imageWidth=atoi(arg[4]);
// int maxIterations=atoi(argv[5]);
// double cRe=atof(argv[3]);
// double cIm=atof(argv[4]);
//double cRe, cIm;
int imageWidth=10000, imageHeight=10000, maxIterations=3000;
double newRe, newIm, oldRe, oldIm,cRe,cIm;
cRe = -0.7;
cIm = 0.27015;
string fname;
QElapsedTimer time;
// NOTE(review): the arguments are passed as (imageHeight, imageWidth); this only
// goes unnoticed because both are 10000 here - confirm the intended order.
QImage img(imageHeight, imageWidth, QImage::Format_RGB888);//Qimagetesting
img.fill(QColor(Qt::black).rgb());//Qimagetesting
time.start();
int i,x,y;
int r, gr, b;
// Only x, y and i are private. newRe/newIm are explicitly shared, and oldRe,
// oldIm, r, gr and b are declared outside the region without a private clause,
// so every thread reads and writes the same copies of these per-pixel values.
#pragma omp parallel for shared(imageHeight,imageWidth,newRe,newIm) private(x,y,i) num_threads(3)
for(y = 0; y < imageHeight; y++)
{
for(x = 0; x < imageWidth; x++)
{
// Starting point for this pixel; julia() then overwrites newRe/newIm/oldRe/oldIm in place.
newRe = newReal(x,imageWidth);
newIm = newImaginary(y,imageHeight);
i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
// Derive the colour channels from the iteration count.
r = (3*i % 256);
gr = (2*(int)sqrt(i) % 256);
b = (i % 256);
// Called from all threads at once on the same image, with no synchronisation.
img.setPixel(x, y, qRgb(r, gr, b));
}
}
//stringstream s;
//s << fnum;
//fname= "julia" + s.str();
//fname+=".png";
//img.save(fname.c_str(),"PNG", 100);
img.save("julia.png","PNG", 100);
cout<< "Finished"<<endl;
// elapsed() is in milliseconds; the reported time includes saving the PNG.
cout<<time.elapsed()/1000.00<<" seconds"<<endl;
}
As pointed out in the comments, you have three main problems:
newRe and newIm are shared, but should not be
The data-sharing of r, gr and b is not specified (they are shared by default, I think)
There are concurrent calls to QImage::setPixel
To correct this, do not hesitate to nest an omp for loop inside an omp parallel block.
Declare the private variables just before the for loop:
To prevent concurrent calls to QImage::setPixel, since this function is not thread safe, you can put it in a critical region, with #pragma omp critical.
// Corrected renderer: every per-pixel variable is declared inside the parallel
// region, and the call to QImage::setPixel is serialised with a critical section.
int main(int argc, char *argv[])
{
int imageWidth=1000, imageHeight=1000, maxIterations=3000;
double cRe = -0.7;
double cIm = 0.27015;
QElapsedTimer time;
// NOTE(review): arguments are passed as (imageHeight, imageWidth); harmless
// here because both are 1000 - confirm the intended order.
QImage img(imageHeight, imageWidth, QImage::Format_RGB888);//Qimagetesting
img.fill(Qt::black);
time.start();
#pragma omp parallel
{
/* all following values will be private: they are declared inside the parallel region, so each thread gets its own copy */
int i,x,y;
int r, gr, b;
double newRe, newIm, oldRe, oldIm;
// Distribute the rows among the threads of the enclosing parallel region.
#pragma omp for
for(y = 0; y < imageHeight; y++)
{
for(x = 0; x < imageWidth; x++)
{
newRe = newReal(x,imageWidth);
newIm = newImaginary(y,imageHeight);
i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
r = (3*i % 256);
gr = (2*(int)sqrtf(i) % 256);
b = (i % 256);
// Only one thread at a time may execute the setPixel call below.
#pragma omp critical
img.setPixel(x, y, qRgb(r, gr, b));
}
}
}
img.save("julia.png","PNG", 100);
// elapsed() is in milliseconds; the reported time includes saving the PNG.
cout<<time.elapsed()/1000.00<<" seconds"<<endl;
return 0;
}
To go further, you can save some cpu time replacing ::setPixel by ::scanLine:
// Each y iteration is executed by exactly one thread and writes only into its
// own row, so the pixel bytes are stored directly without a critical section.
#pragma omp for
for(y = 0; y < imageHeight; y++)
{
    uchar *row = img.scanLine(y);
    for(x = 0; x < imageWidth; x++)
    {
        newRe = newReal(x,imageWidth);
        newIm = newImaginary(y,imageHeight);
        i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
        r = (3*i % 256);
        gr = (2*(int)sqrtf(i) % 256);
        b = (i % 256);
        // Three consecutive bytes per pixel, written in the order r, gr, b.
        uchar *pixel = row + 3 * x;
        pixel[0] = r;
        pixel[1] = gr;
        pixel[2] = b;
    }
}
EDIT:
Since the Julia set seems to have central symmetry around the point (0,0), you can perform only half of the calculations:
// Number of rows to compute directly. Rounded up so that the middle row of an
// odd-height image is rendered too (imageHeight / 2 would skip it); for that
// row the mirrored write simply lands in the same row.
int half_height = (imageHeight + 1) / 2;
// compute only for first half of image
#pragma omp for
for(y = 0; y < half_height; y++)
{
    for(x = 0; x < imageWidth; x++)
    {
        newRe = newReal(x,imageWidth);
        newIm = newImaginary(y,imageHeight);
        i= julia(newRe, newIm, oldRe, oldIm, cRe, cIm, maxIterations);
        r = (3*i % 256);
        gr = (2*(int)sqrtf(i) % 256);
        b = (i % 256);
        // Both writes go through setPixel, so they are kept inside one critical section.
        #pragma omp critical
        {
            // set the point
            img.setPixel(x, y, qRgb(r, gr, b));
            // set the symmetric point (mirrored through the image centre)
            img.setPixel(imageWidth-1-x, imageHeight-1-y, qRgb(r, gr, b));
        }
    }
}
I have implemented two different functions to round a double figure to integer.
Here is the first function
// Converts a double to int using x87 instructions through MSVC inline assembly:
// `fld` pushes `value` onto the FPU stack and `fistp` stores it to `t` as an
// integer and pops the stack.
// NOTE(review): the rounding behaviour is whatever rounding mode the FPU is
// currently set to (presumably round-to-nearest-even by default) - confirm.
static inline int round_v1(double value)
{
int t;
__asm
{
fld value;
fistp t;
}
return t;
}
Here is the second function
// Rounds to the nearest integer, resolving exact .5 ties towards the even
// neighbour: a tie whose integer part is already even keeps that integer part,
// every other input is rounded half away from zero.
static inline int round_v2(double value)
{
    double whole = 0.0;
    const double frac = modf(value, &whole);

    // The int conversion of `whole` is only evaluated for exact ties.
    if ((fabs(frac) == 0.5) && ((((int)whole) % 2) == 0))
        return (int)whole;

    const double bias = (value >= 0) ? 0.5 : -0.5;
    return (int)(value + bias);
}
Both functions work well in a single thread, but the second one does not work with multiple threads (using OpenMP). The program just crashes when I use the second one.
Here is the main code where the round_v1 or round_v2 function is called.
// Applies the filter to every pixel of the destination image: each output byte
// is the weighted average of maxk neighbouring source bytes, where a
// neighbour's weight is space_weight[k] * color_weight[|val - val0|].
// This is the version from the question, which crashes when run in parallel.
void
BilateralFilter_Invoker::doFilter() const
{
if (!src || !dst) return;
// i is the parallel loop variable and therefore private to each thread, but j
// is declared out here and not listed in any private clause, so all threads
// share one j and race on the inner loop counter.
int i, j;
// src_width is computed but not used anywhere in this function.
int src_width = width + (radius << 1);
omp_set_num_threads(2);
#pragma omp parallel for
for (i = 0; i < height; ++i)
{
// The source row is offset by `radius` in both directions relative to the destination row.
unsigned char* pSrc = src + (i+radius)*src_step + radius;
unsigned char* pDst = dst + i*dst_step;
for (j = 0; j < width; ++j)
{
float sum = 0.f, wsum = 0.f;
int val0 = pSrc[j];
for (int k = 0; k < maxk; ++k)
{
int val = pSrc[j + space_offset[k]];
float w = space_weight[k] * color_weight[std::abs(val-val0)];
sum += val * w;
wsum += w;
}
// Normalise by the total weight and round to the nearest integer.
//pDst[j] = (unsigned char)round_v2(sum / wsum);
pDst[j] = (unsigned char)round_v1(sum / wsum);
}
}
}
the variables src, dst, height, width, src_step, dst_step, radius, maxk, space_offset, space_weight, color_weight are member variables of class BilateralFilter_Invoker.
I respectively call round_v1 and round_v2 for test and program crashes only when round_v2 was called. I wonder whether the modf(double, double*) function may cause this problem. For further test, I comment this line
fractpart = modf(value, &intpart);
and replace it by
fractpart = intpart = value;
I run the program again and it did not crash again. I have no idea whether modf(double, double*) causes this problem. Or maybe there is something wrong in my code causes the problem rather than the modf(double, double*) function.
Notice that The operating system I use is Windows7 and the compiler is VC10.
You have made the most common mistake with OpenMP on SO. The iterator of your inner loop needs to be made private. You can either do
#pragma omp parallel for private(j)
or use loop initial declarations
for (int j = 0; j < width; ++j)
In fact, since you never use i or j outside of the loops they apply to there is no reason to declare them C89 style outside of the loops.
I want to keep track of total pixels and rays processed by a long running raytracing process. If I update the shared variables every iteration, the process will slow down noticeably because of synchronization. I'd like to keep track of the progress and still get accurate count results at the end. Is there a way to do this with OpenMP for loops?
Here's some code of the loop in question:
// Traces one ray per pixel of renderTarget in parallel. Work is handed out in
// chunks of 4096 pixel indices, and both shared atomic counters are updated
// while the loop runs (sharedPixelCount once for every pixel).
void Raytracer::trace(RenderTarget& renderTarget, const Scene& scene, std::atomic<int>& sharedPixelCount, std::atomic<int>& sharedRayCount)
{
    const int width = renderTarget.getWidth();
    const int height = renderTarget.getHeight();
    const int totalPixelCount = width * height;

    #pragma omp parallel for schedule(dynamic, 4096)
    for (int pixelIndex = 0; pixelIndex < totalPixelCount; ++pixelIndex)
    {
        // Recover the 2D pixel position from the flat, row-major index.
        const int y = pixelIndex / width;
        const int x = pixelIndex % width;

        Ray rayToScene = scene.camera.getRay(x, y);
        shootRay(rayToScene, scene, sharedRayCount); // will increment sharedRayCount
        renderTarget.setPixel(x, y, rayToScene.color.clamped());

        ++sharedPixelCount;
    }
}
Since you have a chunk size of 4096 for your dynamically scheduled parallel-for loop, why not use that as the granularity for amortizing the counter updates?
For example, something like the following might work. I didn't test this code and you probably need to add some bookkeeping for totalPixelCount%4096!=0.
Unlike the previous answer, this does not add a branch to your loop, other than the one implied by the loop itself, for which many processors have optimized instructions. It also does not require any extra variables or arithmetic.
// Traces one ray per pixel, processing the pixels in chunks of 4096 indices.
// Each chunk is one iteration of the parallel loop, and sharedPixelCount is
// bumped once per finished chunk rather than once per pixel.
void Raytracer::trace(RenderTarget& renderTarget, const Scene& scene, std::atomic<int>& sharedPixelCount, std::atomic<int>& sharedRayCount)
{
    int width = renderTarget.getWidth();
    int height = renderTarget.getHeight();
    int totalPixelCount = width * height;
    const int chunkSize = 4096;

    #pragma omp parallel for schedule(dynamic, 1)
    for (int j = 0; j < totalPixelCount; j += chunkSize)
    {
        // Clamp the chunk to the end of the image so that the last, possibly
        // partial, chunk neither runs past totalPixelCount nor is over-counted.
        const int remaining = totalPixelCount - j;
        const int chunkEnd = j + (remaining < chunkSize ? remaining : chunkSize);

        // The original bound was "i < (i+4096)", which compares i with itself
        // plus a constant and so never ends the loop at the chunk boundary.
        for (int i = j; i < chunkEnd; ++i)
        {
            int x = i % width;
            int y = i / width;
            Ray rayToScene = scene.camera.getRay(x, y);
            shootRay(rayToScene, scene, sharedRayCount);
            renderTarget.setPixel(x, y, rayToScene.color.clamped());
        }
        // Count exactly the pixels this chunk rendered.
        sharedPixelCount += chunkEnd - j;
    }
}
It's not really clear why sharedPixelCount needs to be updated inside of this loop at all, since it is not referenced in the loop body. If this is correct, I suggest the following instead.
// Traces one ray per pixel and counts the rendered pixels with an OpenMP
// reduction: every thread increments its own private copy of reducePixelCount
// and the copies are summed when the loop ends. sharedPixelCount is therefore
// only written once, after the loop.
void Raytracer::trace(RenderTarget& renderTarget, const Scene& scene, std::atomic<int>& sharedPixelCount, std::atomic<int>& sharedRayCount)
{
    int width = renderTarget.getWidth();
    int height = renderTarget.getHeight();
    int totalPixelCount = width * height;
    int reducePixelCount = 0;
    // reducePixelCount appears only in the reduction clause: a list item may
    // not be named in both reduction and shared, so the shared clause was dropped.
    #pragma omp parallel for schedule(dynamic, 4096) \
                             reduction(+:reducePixelCount)
    for (int i = 0; i < totalPixelCount; ++i)
    {
        int x = i % width;
        int y = i / width;
        Ray rayToScene = scene.camera.getRay(x, y);
        shootRay(rayToScene, scene, sharedRayCount);
        renderTarget.setPixel(x, y, rayToScene.color.clamped());
        ++reducePixelCount; /* thread-local operation, not atomic */
    }
    /* The interoperability of C++11 atomics and OpenMP is not defined yet,
     * so this should just be avoided until OpenMP 5 at the earliest.
     * It is sufficient to reduce over a non-atomic type and
     * do the assignment here. */
    sharedPixelCount = reducePixelCount;
}
Here's an example on how to do it:
// Traces one ray per pixel while publishing approximate progress: the shared
// counters are updated once every 100 loop indices instead of on every pixel,
// and are overwritten with the exact totals after the loop.
void Raytracer::trace(RenderTarget& renderTarget, const Scene& scene, std::atomic<int>& sharedPixelCount, std::atomic<int>& sharedRayCount)
{
int width = renderTarget.getWidth();
int height = renderTarget.getHeight();
int totalPixelCount = width * height;
int rayCount = 0;
int previousRayCount = 0;
// rayCount is a reduction variable: each thread counts into its own copy and
// the copies are summed after the loop. previousRayCount is firstprivate, so
// each thread starts with its own copy initialised to 0.
#pragma omp parallel for schedule(dynamic, 1000) reduction(+:rayCount) firstprivate(previousRayCount)
for (int i = 0; i < totalPixelCount; ++i)
{
int x = i % width;
int y = i / width;
Ray rayToScene = scene.camera.getRay(x, y);
// NOTE(review): here shootRay receives the plain int rayCount rather than the
// atomic counter - presumably an overload taking int& exists; confirm.
shootRay(rayToScene, scene, rayCount);
renderTarget.setPixel(x, y, rayToScene.color.clamped());
// Every 100th index: publish 100 pixels plus the rays this thread has
// counted since its previous publication.
if ((i + 1) % 100 == 0)
{
sharedPixelCount += 100;
sharedRayCount += (rayCount - previousRayCount);
previousRayCount = rayCount;
}
}
// Replace the approximate progress values with the exact final totals.
sharedPixelCount = totalPixelCount;
sharedRayCount = rayCount;
}
It won't be 100% accurate while the loop is running, but the error is negligible. At the end exact values will be reported.
I am writing a function to change the values of the pixels in an image. The way it works is by splitting up the task of shading each pixel into multiple threads. For example if there are 4 threads then each one will shade every 4 pixels. What I find strange is that the threaded approach is about a 1/10 of a second slower than doing it in a single loop. I can't figure out why this is since I have a quad core CPU and there is no real synchronization involved between the threads. I would expect it to be about 4x faster minus a bit of overhead. Am I doing something wrong here?
Note that I set nthreads=1 to measure the single loop approach.
FYI raster is a pointer in the class which points to a dynamic array of pixels.
// Applies the shader `sh` to every pixel of the rectangle [sx, ex) x [sy, ey)
// and prints the time measured with clock(). With more than one thread the
// pixels are interleaved: thread i handles pixels i, i + nthreads, i + 2*nthreads, ...
// Throws std::invalid_argument when the bounds are rejected by the check below.
void RGBImage::shade(Shader sh, size_t sx, size_t sy, size_t ex, size_t ey)
{
validate();
// An end coordinate of 0 means "up to the image edge".
if(ex == 0)
ex = width;
if(ey == 0)
ey = height;
// sx and sy are size_t, so the "< 0" comparisons can never be true.
if(sx < 0 || sx >= width || sx >= ex || ex > width || sy < 0 || sy >= height || sy >= ey
|| ey > height)
throw std::invalid_argument("Bounds Invalid");
size_t w = ex - sx;
size_t h = ey - sy;
// Clamp the thread count to the range [1, MAX_THREADS].
size_t nthreads = std::thread::hardware_concurrency();
if(nthreads > MAX_THREADS)
nthreads = MAX_THREADS;
else if(nthreads < 1)
nthreads = 1;
// Reduce the thread count when each thread would get fewer than
// MIN_THREAD_LOAD pixels. This can yield 0, which takes the single-threaded
// branch below because of the "nthreads > 1" test.
size_t load_per_thread = w * h / nthreads;
if(load_per_thread < MIN_THREAD_LOAD)
nthreads = (w * h) / MIN_THREAD_LOAD;
clock_t start = clock();
if(nthreads > 1)
{
std::unique_ptr<std::thread[]> threads(new std::thread[nthreads]);
for(size_t i = 0; i < nthreads; i++)
threads[i] = std::thread([=]()
{
// Strided walk over the rectangle's pixels, starting at index i.
for(size_t p = i; p < (w * h); p += nthreads)
{
// Convert the flat index within the rectangle to image coordinates.
size_t x = sx + p % w;
size_t y = sy + p / w;
sh(raster[y * width + x], x, y);
}
});
for(size_t i = 0; i < nthreads; i++)
threads[i].join();
}
else
{
// Single-threaded path: visit the rectangle's pixels in row-major order.
for(size_t p = 0; p < (w * h); ++p)
{
size_t x = sx + p % w;
size_t y = sy + p / w;
sh(raster[y * width + x], x, y);
}
}
// Prints the clock() difference converted to seconds.
std::cout << ((float)(clock() - start) / CLOCKS_PER_SEC) << std::endl;
}
I took some of the advice and changed up my function.
// Applies the shader `sh` to the whole image and prints the time measured with
// clock(). When `threads` is true the raster is cut into contiguous runs of
// pixels, one per thread, and the last thread also takes the remainder.
// Otherwise the shader is called once for the entire raster.
void RGBImage::shade(Shader sh, bool threads)
{
    validate();
    clock_t c = clock();
    if(threads)
    {
        int nthreads = std::thread::hardware_concurrency();
        size_t pix = width * height;
        if(nthreads < 1)
            nthreads = 1;
        else if(nthreads > MAX_THREADS)
            nthreads = MAX_THREADS;
        // Use fewer threads when each would get less than MIN_THREAD_LOAD pixels.
        if(pix / nthreads < MIN_THREAD_LOAD)
            nthreads = pix / MIN_THREAD_LOAD;
        // The reduction above yields 0 for images smaller than MIN_THREAD_LOAD;
        // without this clamp the division below would divide by zero.
        if(nthreads < 1)
            nthreads = 1;
        size_t pix_per_threads = pix / nthreads;
        std::unique_ptr<std::thread[]> t(new std::thread[nthreads]);
        for(int i = 0; i < nthreads; i++)
        {
            t[i] = std::thread([=]()
            {
                // First pixel of this thread's run, as a flat offset and as (x, y).
                size_t offset = i * pix_per_threads;
                size_t x = offset % width;
                size_t y = offset / width;
                // The last thread additionally processes the pixels left over by the integer division.
                sh(raster + offset, *this, x, y,
                   i == nthreads - 1 ? pix_per_threads + (width * height) % nthreads : pix_per_threads);
            });
        }
        for(int i = 0; i < nthreads; i++)
            t[i].join();
    }
    else
    {
        sh(raster, *this, 0, 0, width * height);
    }
    // Prints the clock() difference converted to seconds.
    std::cout << ((float)(clock() - c) / CLOCKS_PER_SEC) << std::endl;
}
Now it runs about 10x faster but the threaded version is still slower.
What you have done is maximized contention between threads.
You want to minimize it.
Threads should work on a scanline at a time (or more). Divide your image into n blocks of roughly equal number of scanlines (left of image to right), and tell each thread to work on the nth block of scanlines.
std::vector<std::thread> threads;
threads.reserve(nthreads);
for(size_t i = 0; i < nthreads; i++) {
size_t v_start = (h*i)/nthreads;
size_t v_end = (h*(i+1))/nthreads;
threads.push_back(std::thread([=]()
{
for(size_t y = v_start; y < v_end; ++y)
{
for (size_t x = 0; x < w; ++x) {
sh(raster[y * width + x], x, y);
}
}
}));
}
for(auto&& thread:threads)
thread.join();
another approach is to grab the ppl (parallel patterns library) and use it. It will dynamically balance the number of threads based on the current load and hardware specs, and might use thread pooling to reduce thread startup costs.
A serious concern is your Shader sh. You do not want to be calling anything as expensive as a function pointer (or even more expensive, a std::function) on a per-pixel basis.
My general rule is that I write a "for each pixel" function that takes the pixel operation as a F&&, and pass it to a "for each scanline" function after wrapping the pixel-shader in an (in-header-file) scanline based operation. The cost of indirection is then reduced to once per scanline. In addition, the compiler may be able to optimize between pixel operations (say, doing SIMD), while a per-pixel call cannot be optimized this way.
A final problem with your "interleave" solution is it makes it impossible for the compiler to vectorize your code. Vectorization can easily give a 3-4x speedup.
Well, the answer is pretty simple. The threaded solution was in fact faster. It was just consuming more clock() time, and the clock() function is not good for timing threads.
in C++ you can take advantage of those cores using parallelism or use amp (accelerated massive parallelism). My vote will be to do the latter.
sample amp project: http://austin.codeplex.com/
https://msdn.microsoft.com/en-us/library/hh265137.aspx
http://blogs.msdn.com/b/nativeconcurrency/archive/2012/08/30/learn-c-amp.aspx
I've modified a raytracer I wrote a while ago for educational purposes to take advantage of multiprocessing using OpenMP. However, I'm not seeing any profit from the parallelization.
I've tried 3 different approaches: a task-pooled environment (the draw_pooled() function), a standard OMP parallel nested for loop with image row-level parallelism (draw_parallel_for()), and another OMP parallel for with pixel-level parallelism (draw_parallel_for2()). The original, serial drawing routine is also included for reference (draw_serial()).
I'm running a 2560x1920 render on an Intel Core 2 Duo E6750 (2 cores @ 2.67GHz each w/Hyper-Threading) and 4GB of RAM under Linux, binary compiled by gcc with libgomp. The scene takes an average of:
120 seconds to render in series,
but 196 seconds (sic!) to do so in parallel in 2 threads (the default - number of CPU cores), regardless of which of the three particular methods above I choose,
if I override OMP's default thread number with 4 to take HT into account, the parallel render times drop to 177 seconds.
Why is this happening? I can't see any obvious bottlenecks in the parallel code.
EDIT: Just to clarify - the task pool is only one of the implementations, please do read the question - scroll down to see the parallel fors. Thing is, they are just as slow as the task pool!
// Renders a w x h scene into a buffer of 3 bytes per pixel and passes it to
// write_png. A separate parallel region is opened for every row; only the
// pixels of that one row are divided among the 4 threads.
void draw_parallel_for(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    for (int row = 0; row < h; ++row) {
        #pragma omp parallel for num_threads(4)
        for (int col = 0; col < w; ++col) {
            unsigned char *dest = pixels + (row * w + col) * 3;
            Scene::GetInstance().RenderPixel(col, row, dest);
        }
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Renders a w x h scene into a buffer of 3 bytes per pixel and passes it to
// write_png. One flat parallel loop runs over all w * h pixel indices.
void draw_parallel_for2(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    #pragma omp parallel for num_threads(4)
    for (int index = 0; index < w * h; ++index) {
        // Declared inside the loop body, so every thread has its own col/row.
        const int col = index % w;
        const int row = index / w;
        Scene::GetInstance().RenderPixel(col, row, pixels + (row * w + col) * 3);
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Renders a w x h scene into a buffer of 3 bytes per pixel and passes it to
// write_png. The rows are divided among 4 threads; each thread renders its
// rows from left to right.
void draw_parallel_for3(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    #pragma omp parallel for num_threads(4)
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            unsigned char *dest = pixels + (row * w + col) * 3;
            Scene::GetInstance().RenderPixel(col, row, dest);
        }
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Single-threaded reference renderer: fills a buffer of 3 bytes per pixel in
// row-major order and passes it to write_png.
void draw_serial(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    // Pixels are visited in exactly the order they are laid out in the buffer,
    // so the destination simply advances by 3 bytes per pixel.
    unsigned char *dest = pixels;
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            Scene::GetInstance().RenderPixel(col, row, dest);
            dest += 3;
        }
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Global work queue of heap-allocated (x, y) pixel coordinates; entries are
// allocated by the producer and deleted by whichever thread renders them.
std::queue< std::pair<int, int> * > task_queue;
// Task-pool renderer: the master thread enqueues one task per pixel, and all 4
// threads repeatedly pop a task inside a critical section and render that pixel.
void draw_pooled(int w, int h, const char *fname) {
unsigned char *buf;
buf = new unsigned char[w * h * 3];
Scene::GetInstance().PrepareRender(w, h);
// Set by the master once every task has been enqueued.
bool tasks_issued = false;
#pragma omp parallel shared(buf, tasks_issued, w, h) num_threads(4)
{
// Only the master thread fills the queue. The master construct has no implied
// barrier, so the other threads proceed to the fetch loop below right away.
// NOTE(review): these pushes happen outside the task_fetch critical section
// while other threads may be popping inside it - this looks like
// unsynchronised access to the std::queue; confirm.
#pragma omp master
{
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x)
task_queue.push(new std::pair<int, int>(x, y));
}
tasks_issued = true;
}
while (true) {
std::pair<int, int> *coords;
// One synchronised queue access per pixel.
#pragma omp critical(task_fetch)
{
if (task_queue.size() > 0) {
coords = task_queue.front();
task_queue.pop();
} else
coords = NULL;
}
if (coords != NULL) {
// Render the fetched pixel into its 3-byte slot, then free the task.
Scene::GetInstance().RenderPixel(coords->first, coords->second,
buf + (coords->second * w + coords->first) * 3);
delete coords;
} else {
// Queue was empty: stop only once the master has finished enqueuing,
// otherwise loop and try again.
#pragma omp flush(tasks_issued)
if (tasks_issued)
break;
}
}
}
write_png(buf, w, h, fname);
delete [] buf;
}
You have a critical section inside your innermost loop. In other words, you're hitting a synchronization primitive per pixel. That's going to kill performance.
Better split the scene in tiles and work one on each thread. That way, you have a longer time (a whole tile's worth of processing) between synchronizations.
If the pixels are independent you don't actually need any locking. You can just divide up the image into rows or columns and let the threads work on their own. For example, you could have each thread operate on every nth row (pseudocode):
// Each thread starts at its own index and steps by the number of threads,
// so thread t renders rows t, t + THREAD_COUNT, t + 2*THREAD_COUNT, ...
for(int y = THREAD_NUM; y < h; y += THREAD_COUNT)
    for(int x = 0; x < w; ++x)
        render_pixel(x,y);
Where THREAD_NUM is a unique number for each thread such that 0 <= THREAD_NUM < THREAD_COUNT. Then after you join your threadpool, perform the png conversion.
There is always a performance overhead when creating threads. An OMP parallel region inside a for loop will obviously generate a lot of overhead. For example, in your code
// Abbreviated copy of the question's draw_parallel_for (buffer allocation,
// PrepareRender and write_png are left out) showing where the overhead comes from.
void draw_parallel_for(int w, int h, const char *fname) {
for (int y = 0; y < h; ++y) {
// Here there is a lot of overhead: a new parallel region is opened for every row (h times in total)
#pragma omp parallel for num_threads(4)
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
It can be re-written as
// First rewrite (buffer allocation, PrepareRender and write_png are left out):
// a single parallel region whose threads divide the rows among themselves.
void draw_parallel_for(int w, int h, const char *fname) {
// The pragma now sits on the outer loop, so the parallel region is opened once.
#pragma omp parallel for num_threads(4)
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
or
// Second rewrite (buffer allocation, PrepareRender and write_png are left out):
// the parallel region is opened once, every thread runs the outer row loop, and
// the pixels of each row are divided among the threads.
void draw_parallel_for(int w, int h, const char *fname) {
#pragma omp parallel num_threads(4)
for (int y = 0; y < h; ++y) {
// Work-sharing only: distributes the x iterations over the threads of the
// enclosing region. The threads synchronise at the implicit barrier that
// ends each "omp for" loop.
#pragma omp for
for (int x = 0; x < w; ++x)
Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
}
}
This way, you will eliminate the overhead of repeatedly creating the parallel region.