I'm new to Pthreads and c++ and trying to parallelize an image flipping program. Obviously it isnt working. I'm told I need to port some code from an Image class but not really sure what porting means. I just copied and pasted the code but I guess that's wrong.
I get the general idea. allocate the workload, intitialize the threads, create the threads, join the threads and define a callback function.
I'm not totally sure what the cells_per_thread should be. I'm pretty sure it should be the image width * height / threads. Does that seem correct?
I'm getting multiple errors when compiling with cmake.
its saying m_thread_number, getWidth, getHeight, getPixel, temp are not define in the scope. I assume thats because the Image class code isn't ported?
PthreadImage.cxx
//Declare a callabck fucntion for Horizontal flip
void* H_flip_callback_function(void* aThreadData);
PthreadImage PthreadImage::flipHorizontally() const
{
if (m_thread_number == 0 || m_thread_number == 1)
{
return PthreadImage(Image::flipHorizontally(), m_thread_number);
}
else
{
PthreadImage temp(getWidth(), getHeight(), m_thread_number);
//Workload allocation
//Create a vector of type ThreadData whcih is constructed at the top of the class under Struct ThreadData. Pass in the number of threads.
vector<ThreadData> p_thread_data(m_thread_number);
//create an integer to hold the last element. inizialize it as -1.
int last_element = -1;
//create an unsigned int to hold how many cells we need per thread. For the image we want the width and height divided by the number of threads.
unsigned int cells_per_thread = getHeight() * getWidth() / m_thread_number;
//Next create a variable to hold the remainder of the sum.
unsigned int remainder = getHeight() * getWidth() % m_thread_number;
//print the number of cells per thread to the console
cout << "Default number for cells per thread: " << cells_per_thread << endl;
//inizialize the threads with a for loop to interate through each thread and populate it
for (int i = 0; i < m_thread_number; i++)
{
//thread ids correspond with the for loop index values.
p_thread_data[i].thread_id = i;
//start is last element + 1 i.e -1 + 1 start = 0.
p_thread_data[i].start_id = ++last_element;
p_thread_data[i].end_id = last_element + cells_per_thread - 1;
p_thread_data[i].input = this;
p_thread_data[i].output = &temp;
//if the remainder is > thats 0 add 1 to the end them remove 1 remainder.
if (remainder > 0)
{
p_thread_data[i].end_id++;
--remainder;
}
//make the last element not = -1 but = the end of the threads.
last_element = p_thread_data[i].end_id;
//print to console what number then thread start and end on
cout << "Thread[" << i << "] starts with " << p_thread_data[i].start_id << " and stops on " << p_thread_data[i].end_id << endl;
}
//create the threads with antoher for loop
for (int i = 0; i < m_thread_number; i++)
{
pthread_create(&p_thread_data[i].thread_id, NULL, H_flip_callback_function, &p_thread_data[i]);
}
//Wait for each thread to complete;
for (int i = 0; i < m_thread_number; i++)
{
pthread_join(p_thread_data[i].thread_id, NULL);
}
return temp;
}
}
Callback function
//Define the callabck fucntion for Horizontal flip
void* H_flip_callback_function(void* aThreadData)
{
//convert void to Thread data
ThreadData* p_thread_data = static_cast<ThreadData*>(aThreadData);
int tempHeight = temp(getHeight());
int tempWidth = temp(getWidth());
for (int i = p_thread_data->start_id; i <= p_thread_data->end_id; i++)
{
// Process every row of the image
for (unsigned int j = 0; j < m_height; ++j)
{
// Process every column of the image
for (unsigned int i = 0; i < m_width / 2; ++i)
{
(*(p_thread_data->output))( i, j) = getPixel(m_width - i - 1, j);
(*(p_thread_data->output))(m_width - i - 1, j) = getPixel( i, j);
}
}
}
}
Image class
#include <sstream> // Header file for stringstream
#include <fstream> // Header file for filestream
#include <algorithm> // Header file for min/max/fill
#include <numeric> // Header file for accumulate
#include <cmath> // Header file for abs and pow
#include <vector>
#include "Image.h"
//-----------------
Image::Image():
//-----------------
m_width(0),
m_height(0)
//-----------------
{}
//----------------------------------
Image::Image(const Image& anImage):
//----------------------------------
m_width(anImage.m_width),
m_height(anImage.m_height),
m_p_image(anImage.m_p_image)
//----------------------------------
Image class code to be ported
//-----------------------------------
Image Image::flipHorizontally() const
//-----------------------------------
{
// Create an image of the right size
Image temp(getWidth(), getHeight());
// Process every row of the image
for (unsigned int j = 0; j < m_height; ++j)
{
// Process every column of the image
for (unsigned int i = 0; i < tempWidth / 2; ++i)
{
temp(i, j) = getPixel(tempWidth - i - 1, j);
temp(tempWidth - i - 1, j) = getPixel(i, j);
}
}
return 0;
}
I feel like its pretty close. Any help greatly appreciated!
EDIT
Ok, so this is the correct code for anyone wasting their time on this.
There was obviously a fair few things wrong.
I don't know why there was 3 for loops. There should be 2. 1 for Rows and 1 for columns.
The cells_per_thread should be pixels_per_thread and rows/threads as #Larry B suggested not ALL the pixels per thread.
You can use -> to get members of a pointer i.e setPixel(),getPixel` etc. Who knew that!?
There was a data structure that was pretty inportant for you guys but I forgot.
struct ThreadData
{
pthread_t thread_id;
unsigned int start_id;
unsigned int end_id;
const Image* input;
Image* output;
};
Correct Callback
void* H_flip_callback_function(void* aThreadData)
{
//convert void to Thread data
ThreadData* p_thread_data = static_cast<ThreadData*>(aThreadData);
int width = p_thread_data->input->getWidth();
// Process every row of the image
for (unsigned int j = p_thread_data->start_id; j <=p_thread_data->end_id; ++j)
}
// Process every column of the image
for (unsigned int i = 0; i < width / 2; ++i)
{
p_thread_data->output->setPixel(i,j, p_thread_data->input->getPixel(width - i - 1, j));
p_thread_data->output->setPixel(width - i - 1, j, p_thread_data->input->getPixel(i, j));
}
}
return 0;
}
So now this code compiles and flips.
Thanks!
The general strategy for porting single threaded code to a multi-thread version is essentially rewriting the existing code to divide the work into self contained units of work that you can hand off to a thread for execution.
With that in mind, I don't agree with your implementation of H_flip_callback_function:
void* H_flip_callback_function(void* aThreadData)
{
//convert void to Thread data
ThreadData* p_thread_data = static_cast<ThreadData*>(aThreadData);
// Create an image of the right size
PthreadImage temp(getWidth(), getHeight(), m_thread_number);
int tempHeight = temp(getHeight());
int tempWidth = temp(getWidth());
for (int i = p_thread_data->start_id; i <= p_thread_data->end_id; i++)
{
// Process every row of the image
for (unsigned int j = 0; j < tempHeight; ++j)
{
// Process every column of the image
for (unsigned int i = 0; i < tempWidth / 2; ++i)
{
temp(i, j) = getPixel(tempWidth - i - 1, j);
temp(tempWidth - i - 1, j) = getPixel(i, j);
}
}
}
}
At face value, it looks like all your threads will be operating on the whole image. If this is the case, there is no real difference between your single and multi-thread version as you're just doing the same work multiple times in the multi-thread version.
I would argue that the smallest self contained unit of work would be to horizontally flip a single row of the image. However, if you have less threads than the number of rows, then you could allocate (Num rows / Num threads) to each thread. Each thread would then flip the rows assigned to it and the main thread would collect the results and assemble the final image.
With regards to your build warnings and errors, you'll have to provide the complete source code, build settings, environment, etc..
Related
I`m trying to write merge sort with 2 threads.
I divide array into 2 pieces and sort each half with usual merge sort. After that I just merge two sorted parts.
Usual merge sort works correctly, and if I apply it to eash part without threads, it works correctly too.
I run a lof of tests on randomly generated short arrays, and there can be 2k of correct tests, but sometimes my multithread sort doesn`t work properly.
After sorting each half but before merging them, I check them. Sometimes the set of numbers in current part of array occurs to be different from orinigal set of numbers in that part before sorting, the numbers just appear from nowhere.
There must be some problem with threads, because there is no such problem without them.
As you can see, I made buffer with length = array.size() and I pass reference on it to functions. When merging two sorted arrays, this buffer is used.
Each buffer element is initialized with 0.
I`m sure that there is no shared data, because every function uses separated part of buffer. The correct work of usual merge sort supports that.
Please, help to understand, what is wrong with this way of using threads, I`m absolutely confused.
P. S. my code is supposed to execute sorting in N threads, not in 2, thats why I create array of threads. But even with 2 it doesnt work.
Multithread function:
void merge_sort_multithread(std::vector<int>& arr, std::vector<int>& buffer, unsigned int threads_count)
{
int length = arr.size();
std::vector<std::thread> threads;
// dividing array into nearly equal parts
std::vector<int> thread_from; // array with indexes of part`s start
std::vector<int> thread_length; // array with part`s length
make_parts(thread_from, thread_length, threads_count, length);
// start threads
for (int i = 0; i < threads_count; ++i)
{
threads.push_back(std::thread(merge_sort, std::ref(arr), std::ref(buffer),
thread_length[i], thread_from[i]));
}
// waiting for end of sorting
for (int i = 0; i < threads_count; ++i)
threads[i].join();
// ------- here I check each part and find mistakes, so next function is not important ----
merge_sorted_after_multithreading(arr, buffer, thread_from, thread_length, threads_count, 0);
}
Usual merge sort:
void merge_sort(std::vector<int>& arr, std::vector<int>& buffer, size_t length, int from)
{
if (length == 1)
{
return;
}
int length_left = length / 2;
int length_right = length - length_left;
// sorting each part
merge_sort(arr, buffer, length_left, from);
merge_sort(arr, buffer, length_right, from + length_left);
// merging sorted parts
merge_arrays(arr, buffer, length_left, length - length_left, from, from + length_left);
}
Merging two sorted arrays with buffer:
void merge_arrays(std::vector<int>& arr, std::vector<int>& buffer, size_t length_left, size_t length_right, int start_left, int start_right)
{
int idx_left, idx_right, idx_buffer;
idx_left = idx_right = idx_buffer = 0;
while ((idx_left < length_left) && (idx_right < length_right))
{
if (arr[start_left + idx_left] < arr[start_right + idx_right])
{
do {
buffer[idx_buffer] = arr[start_left + idx_left];
++idx_buffer;
++idx_left;
} while ((idx_left < length_left) && (arr[start_left + idx_left] < arr[start_right + idx_right]));
}
else
{
do {
buffer[idx_buffer] = arr[start_right + idx_right];
++idx_buffer;
++idx_right;
} while ((idx_right < length_right) && (arr[start_right + idx_right] < arr[start_left + idx_left]));
}
}
if (idx_left == length_left)
{
for (; idx_right < length_right; ++idx_right)
{
buffer[idx_buffer] = arr[start_right + idx_right];
++idx_buffer;
}
}
else
{
for (; idx_left < length_left; ++idx_left)
{
buffer[idx_buffer] = arr[start_left + idx_left];
++idx_buffer;
}
}
// copying result to original array
for (int i = 0; i < idx_buffer; ++i)
{
arr[start_left + i] = buffer[i];
}
}
Dividing array into separated parts:
void make_parts(std::vector<int>& thread_from, std::vector<int>& thread_length, unsigned int threads_count, size_t length)
{
int dlength = (length / threads_count);
int odd_length = length % threads_count;
int offset = 0;
for (int i = 0; i < threads_count; ++i)
{
if (odd_length > 0)
{
thread_length.push_back(dlength + 1);
--odd_length;
}
else
thread_length.push_back(dlength);
thread_from.push_back(offset);
offset += thread_length[i];
}
}
P.P.S. Each function except multithread sort was tested and works correctly
I have a question here about entry sustitution. Let's say we have a matrix (squared) of a fixed size MATRIX_SIZE (unsorted 2D Array), a list of numbers replacementPolicy and another list of number substitudeNUMBER. We loop over the matrix, and if the entry has the same value as the (first) element in the replacementPolicy, we remember the position i, and substitude this entry with the i-th element in substitudeNUMBER. It sounds a little bit complicated, the code is as follows:
void substitute_entry() {
// For each entry in the matrix
for (int column = 0; column < MATRIX_SIZE; ++column) {
for (int row = 0; row < MATRIX_SIZE; ++row) {
// Search for the entry in the original number list
// and replace it with corresponding the element in the substituted number list
int index = -1;
for (int i = 0; i < LIST_SIZE; i++) {
if (replacementPolicy[i] == MATRIX[row][column]) {
index = i;
}
}
MATRIX[row][column] = substitutedNUMBER[index];
}
}
}
However, I would expect to optimize this code in order to achieve a faster runtime. My first idea is to switch the for loop - first over columns and then over rows, but this does not affect the runtime significantly. My second thought is to use a better algorithm to replace the entries, but unfortunately I mess up when testing. Is there any better way to do so?
Thank you!
I think your loops are perfect for a multithreading solution, for example, using the OpenMP, and with its capabilities, you can expect a significant improvement in the performance. I've made a few changes to your code, as follows:
#include <iostream>
#include <chrono>
#include <omp.h>
#define MATRIX_SIZE 1000
#define LIST_SIZE 1000
int arr[MATRIX_SIZE][MATRIX_SIZE];
int replacementPolicy[LIST_SIZE];
int substitutedNUMBER[MATRIX_SIZE];
void substitute_entry() {
// For each entry in the matrix
#pragma omp parallel for
for (int column = 0; column < MATRIX_SIZE; ++column) {
#pragma omp parallel for
for (int row = 0; row < MATRIX_SIZE; ++row) {
// Search for the entry in the original number list
// and replace it with corresponding the element in the substituted number list
int index = -1;
for (int i = 0; i < LIST_SIZE; i++) {
if (replacementPolicy[i] == arr[row][column]) {
index = i;
}
}
arr[row][column] = substitutedNUMBER[index];
}
}
}
int main()
{
omp_set_num_threads(4);
for ( int i = 0; i<MATRIX_SIZE ; i++)
{
replacementPolicy[i] = i;
substitutedNUMBER[i] = i;
for ( int j=0; j<MATRIX_SIZE ; j++)
{
arr[i][j] = i+j;
}
}
auto start = std::chrono::high_resolution_clock::now();
substitute_entry();
auto end = std::chrono::high_resolution_clock::now();
uint64_t diff = std::chrono::duration_cast<std::chrono::microseconds>(end-start).count();
std::cerr << diff << '\n';
return 0;
}
you can comment out the 3,14,16, and 34 lines and have the single thread version of your code.
In this example with MATRIX_SIZE of 1000, and on my personal computer which has only four cores, the single thread version gets done in 3731737 us and the multithreaded version in 718039 us.
I need to compute a product vector-matrix as efficiently as possible. Specifically, given a vector s and a matrix A, I need to compute s * A. I have a class Vector which wraps a std::vector and a class Matrix which also wraps a std::vector (for efficiency).
The naive approach (the one that I am using at the moment) is to have something like
Vector<T> timesMatrix(Matrix<T>& matrix)
{
Vector<unsigned int> result(matrix.columns());
// constructor that does a resize on the underlying std::vector
for(unsigned int i = 0 ; i < vector.size() ; ++i)
{
for(unsigned int j = 0 ; j < matrix.columns() ; ++j)
{
result[j] += (vector[i] * matrix.getElementAt(i, j));
// getElementAt accesses the appropriate entry
// of the underlying std::vector
}
}
return result;
}
It works fine and takes nearly 12000 microseconds. Note that the vector s has 499 elements, while A is 499 x 15500.
The next step was trying to parallelize the computation: if I have N threads then I can give each thread a part of the vector s and the "corresponding" rows of the matrix A. Each thread will compute a 499-sized Vector and the final result will be their entry-wise sum.
First of all, in the class Matrix I added a method to extract some rows from a Matrix and build a smaller one:
Matrix<T> extractSomeRows(unsigned int start, unsigned int end)
{
unsigned int rowsToExtract = end - start + 1;
std::vector<T> tmp;
tmp.reserve(rowsToExtract * numColumns);
for(unsigned int i = start * numColumns ; i < (end+1) * numColumns ; ++i)
{
tmp.push_back(matrix[i]);
}
return Matrix<T>(rowsToExtract, numColumns, tmp);
}
Then I defined a thread routine
void timesMatrixThreadRoutine
(Matrix<T>& matrix, unsigned int start, unsigned int end, Vector<T>& newRow)
{
// newRow is supposed to contain the partial result
// computed by a thread
newRow.resize(matrix.columns());
for(unsigned int i = start ; i < end + 1 ; ++i)
{
for(unsigned int j = 0 ; j < matrix.columns() ; ++j)
{
newRow[j] += vector[i] * matrix.getElementAt(i - start, j);
}
}
}
And finally I modified the code of the timesMatrix method that I showed above:
Vector<T> timesMatrix(Matrix<T>& matrix)
{
static const unsigned int NUM_THREADS = 4;
unsigned int matRows = matrix.rows();
unsigned int matColumns = matrix.columns();
unsigned int rowsEachThread = vector.size()/NUM_THREADS;
std::thread threads[NUM_THREADS];
Vector<T> tmp[NUM_THREADS];
unsigned int start, end;
// all but the last thread
for(unsigned int i = 0 ; i < NUM_THREADS - 1 ; ++i)
{
start = i*rowsEachThread;
end = (i+1)*rowsEachThread - 1;
threads[i] = std::thread(&Vector<T>::timesMatrixThreadRoutine, this,
matrix.extractSomeRows(start, end), start, end, std::ref(tmp[i]));
}
// last thread
start = (NUM_THREADS-1)*rowsEachThread;
end = matRows - 1;
threads[NUM_THREADS - 1] = std::thread(&Vector<T>::timesMatrixThreadRoutine, this,
matrix.extractSomeRows(start, end), start, end, std::ref(tmp[NUM_THREADS-1]));
for(unsigned int i = 0 ; i < NUM_THREADS ; ++i)
{
threads[i].join();
}
Vector<unsigned int> result(matColumns);
for(unsigned int i = 0 ; i < NUM_THREADS ; ++i)
{
result = result + tmp[i]; // the operator+ is overloaded
}
return result;
}
It still works but now it takes nearly 30000 microseconds, which is almost three times as much as before.
Am I doing something wrong? Do you think there is a better approach?
EDIT - using a "lightweight" VirtualMatrix
Following Ilya Ovodov's suggestion, I defined a class VirtualMatrix that wraps a T* matrixData, which is initialized in the constructor as
VirtualMatrix(Matrix<T>& m)
{
numRows = m.rows();
numColumns = m.columns();
matrixData = m.pointerToData();
// pointerToData() returns underlyingVector.data();
}
Then there is a method to retrieve a specific entry of the matrix:
inline T getElementAt(unsigned int row, unsigned int column)
{
return *(matrixData + row*numColumns + column);
}
Now the execution time is better (approximately 8000 microseconds) but maybe there are some improvements to be made. In particular the thread routine is now
void timesMatrixThreadRoutine
(VirtualMatrix<T>& matrix, unsigned int startRow, unsigned int endRow, Vector<T>& newRow)
{
unsigned int matColumns = matrix.columns();
newRow.resize(matColumns);
for(unsigned int i = startRow ; i < endRow + 1 ; ++i)
{
for(unsigned int j = 0 ; j < matColumns ; ++j)
{
newRow[j] += (vector[i] * matrix.getElementAt(i, j));
}
}
}
and the really slow part is the one with the nested for loops. If I remove it, the result is obviously wrong but is "computed" in less than 500 microseconds. This to say that now passing the arguments takes almost no time and the heavy part is really the computation.
According to you, is there any way to make it even faster?
Actually you make a partial copy of matrix for each thread in extractSomeRows. It takes a lot of time.
Redesign it so that "some rows" become virtual matrix pointing at data located in original matrix.
Use vectorized assembly instructions for an architecture by making it more explicit that you want to multiply in 4's, i.e. for the x86-64 SSE2+ and possibly ARM'S NEON.
C++ compilers can often unroll the loop into vectorized code if you explicitly make an operation happen in contingent elements:
Simple and fast matrix-vector multiplication in C / C++
There is also the option of using libraries specifically made for matrix multipication. For larger matrices, it may be more efficient to use special implementations based on the Fast Fourier Transform, alternate algorithms like Strassen's Algorithm, etc. In fact, your best bet would be to use a C library like this, and then wrap it in an interface that looks similar to a C++ vector.
I'm doing an assignment that involves calculating pi with threads. I've done this using mutex and it works fine, but I would like to get this version working as well. Here is my code.
#include <iostream>
#include <stdlib.h>
#include <iomanip>
#include <vector>
#include <pthread.h>
using namespace std;
typedef struct{
int iterations; //How many iterations this thread is going to do
int offset; //The offset multiplier for the calculations (Makes sure each thread calculates a different part of the formula)
}threadParameterList;
vector<double> partialSumList;
void* pi_calc(void* param){
threadParameterList* _param = static_cast<threadParameterList*>(param);
double k = 1.0;
for(int i = _param->iterations * _param->offset + 1; i < _param->iterations * (_param->offset + 1); ++i){
partialSumList[_param->offset] += (double)k*(4.0/((2.0*i)*(2.0*i+1.0)*(2.0*i+2.0)));
k *= -1.0;
}
pthread_exit(0);
}
int main(int argc, char* argv[]){
//Error checking
if(argc != 3){
cout << "error: two parameters required [iterations][threadcount]" << endl;
return -1;
}
if(atoi(argv[1]) <= 0 || atoi(argv[2]) <= 0){
cout << "error: invalid parameter supplied - parameters must be > 0." << endl;
return -1;
}
partialSumList.resize(atoi(argv[2]));
vector<pthread_t> threadList (atoi(argv[2]));
vector<threadParameterList> parameterList (atoi(argv[2]));
int iterations = atoi(argv[1]),
threadCount = atoi(argv[2]);
//Calculate workload for each thread
if(iterations % threadCount == 0){ //Threads divide evenly
for(int i = 0; i < threadCount; ++i){
parameterList[i].iterations = iterations/threadCount;
parameterList[i].offset = i;
pthread_create(&threadList[i], NULL, pi_calc, ¶meterList[i]);
}
void* status;
for(int i = 0; i < threadCount; ++i){
pthread_join(threadList[i], &status);
}
}
else{ //Threads do not divide evenly
for(int i = 0; i < threadCount - 1; ++i){
parameterList[i].iterations = iterations/threadCount;
parameterList[i].offset = i;
pthread_create(&threadList[i], NULL, pi_calc, ¶meterList[i]);
}
//Add the remainder to the last thread
parameterList[threadCount].iterations = (iterations % threadCount) + (iterations / threadCount);
parameterList[threadCount].offset = threadCount - 1;
pthread_create(&threadList[threadCount], NULL, pi_calc, ¶meterList[threadCount]);
void* status;
for(int i = 0; i < threadCount-1; ++i){
pthread_join(threadList[i], &status);
cout << status << endl;
}
}
//calculate pi
double pi = 3.0;
for(int i = 0; i < partialSumList.size(); ++i){
pi += partialSumList[i];
}
cout << "Value of pi: " << setw(15) << setprecision(15) << pi << endl;
return 0;
}
The code works fine in most cases. There are certain combinations of parameters that cause me to get a double free or corruption error on return 0. For example, if I use the parameters 100 and 10 the program creates 10 threads and does 10 iterations of the formula on each thread, works fine. If I use the parameters 10 and 4 the program creates 4 threads that do 2 iterations on 3 threads and 4 on the 4th thread, works fine. However, if I use 5 and 3, the program will correctly calculate the value and even print it out, but I get the error immediately after. This also happens for 17 and 3, and 10 and 3. I tried 15 and 7, but then I get a munmap_chunk(): invalid pointer error when the threads are trying to be joined - although i think that's something for another question.
If I had to guess, it has something to do with pthread_exit deallocating memory and then the same memory trying to be deallocated again on return, since I'm passing the parameter struct as a pointer. I tried a few different things like creating a local copy and defining parameterList as a vector of pointers, but it didn't solve anything. I've also tried eraseing and clearing the vector before return but that didn't help either.
I see this issue:
You are writing beyond the vector's bounds:
vector<threadParameterList> parameterList (atoi(argv[2]));
//...
int threadCount = atoi(argv[2]);
//...
parameterList[threadCount].iterations = (iterations % threadCount) + (iterations / threadCount);
parameterList[threadCount].offset = threadCount - 1;
Accessing parameterList[threadCount] is out of bounds.
I don't see in the code where threadCount is adjusted, so it remains the same value throughout that snippet.
Tip: If the goal is to access the last item in a container, use vector::back(). It works all the time for non-empty vectors.
parameterList.back().iterations = (iterations % threadCount) + (iterations / threadCount);
parameterList.back().offset = threadCount - 1;
One thing I can see is you might be going past the end of the vector here:
for(int i = 0; i < partialSumList.capacity(); ++i)
capacity() returns how many elements the vector can hold. This can be more than the size() of the vector. You can change you call to capacity() to size() to make sure you don't go past the end of the vector
for(int i = 0; i < partialSumList.size(); ++i)
The second thing I spot is that when iterations % threadCount != 0 you have:
parameterList[threadCount].iterations = (iterations % threadCount) + (iterations / threadCount);
parameterList[threadCount].offset = threadCount - 1;
pthread_create(&threadList[threadCount], NULL, pi_calc, ¶meterList[threadCount]);
Which is writing past the end of the vector. Then when you join all of the threads you don't join the last thread as you do:
for(int i = 0; i < threadCount-1; ++i){
^^^ uh oh. we missed the last thread
pthread_join(threadList[i], &status);
cout << status << endl;
}
I'm trying to run an openmp realization of Dijkstra's algorithm which I downloaded here heather.cs.ucdavis.edu/~matloff/OpenMP/Dijkstra.c
If I add for example one more vertice from 5 to 6, so that the path from 0th goes through two vertices, my program fails to give me a correct result, saying that the distance between 0th and 6th is infinite :^(
What can be the reason?
#define LARGEINT 2<<30-1 // "infinity"
#define NV 6
// global variables, all shared by all threads by default
int ohd[NV][NV], // 1-hop distances between vertices
mind[NV], // min distances found so far
notdone[NV], // vertices not checked yet
nth, // number of threads
chunk, // number of vertices handled by each thread
md, // current min over all threads
mv; // vertex which achieves that min
void init(int ac, char **av)
{ int i,j;
for (i = 0; i < NV; i++)
for (j = 0; j < NV; j++) {
if (j == i) ohd[i][i] = 0;
else ohd[i][j] = LARGEINT;
}
ohd[0][1] = ohd[1][0] = 40;
ohd[0][2] = ohd[2][0] = 15;
ohd[1][2] = ohd[2][1] = 20;
ohd[1][3] = ohd[3][1] = 10;
ohd[1][4] = ohd[4][1] = 25;
ohd[2][3] = ohd[3][2] = 100;
ohd[1][5] = ohd[5][1] = 6;
ohd[4][5] = ohd[5][4] = 8;
for (i = 1; i < NV; i++) {
notdone[i] = 1;
mind[i] = ohd[0][i];
}
}
// finds closest to 0 among notdone, among s through e
void findmymin(int s, int e, int *d, int *v)
{ int i;
*d = LARGEINT;
for (i = s; i <= e; i++)
if (notdone[i] && mind[i] < *d) {
*d = ohd[0][i];
*v = i;
}
}
// for each i in [s,e], ask whether a shorter path to i exists, through
// mv
void updateohd(int s, int e)
{ int i;
for (i = s; i <= e; i++)
if (mind[mv] + ohd[mv][i] < mind[i])
mind[i] = mind[mv] + ohd[mv][i];
}
void dowork()
{
#pragma omp parallel // Note 1
{ int startv,endv, // start, end vertices for this thread
step, // whole procedure goes NV steps
mymd, // min value found by this thread
mymv, // vertex which attains that value
me = omp_get_thread_num(); // my thread number
#pragma omp single // Note 2
{ nth = omp_get_num_threads(); chunk = NV/nth;
printf("there are %d threads\n",nth); }
// Note 3
startv = me * chunk;
endv = startv + chunk - 1;
for (step = 0; step < NV; step++) {
// find closest vertex to 0 among notdone; each thread finds
// closest in its group, then we find overall closest
#pragma omp single
{ md = LARGEINT; mv = 0; }
findmymin(startv,endv,&mymd,&mymv);
// update overall min if mine is smaller
#pragma omp critical // Note 4
{ if (mymd < md)
{ md = mymd; mv = mymv; }
}
// mark new vertex as done
#pragma omp single
{ notdone[mv] = 0; }
// now update my section of ohd
updateohd(startv,endv);
#pragma omp barrier
}
}
}
int main(int argc, char **argv)
{ int i;
init(argc,argv);
dowork();
// back to single thread now
printf("minimum distances:\n");
for (i = 1; i < NV; i++)
printf("%d\n",mind[i]);
}
There are two problems here:
If the number of threads doesn't evenly divide the number of values, then this division of work
startv = me * chunk;
endv = startv + chunk - 1;
is going to leave the last (NV - nth*(NV/nth)) elements undone, which will mean the distances are left at LARGEINT. This can be fixed any number of ways; the easiest for now is to give all remaining work to the last thread
if (me == (nth-1)) endv = NV-1;
(This leads to more load imbalance than is necessary, but is a reasonable start to get the code working.)
The other issue is that a barrier has been left out before setting notdone[]
#pragma omp barrier
#pragma omp single
{ notdone[mv] = 0; }
This makes sure notdone is updated and updateohd() is started only after everyone has finished their findmymin() and updated md and mv.
Note that it's very easy to introduce errors into the original code you started with; the global variables used make it very difficult to reason about. John Burkardt has a nicer version of this same algorithm for teaching up on his website here, which is almost excessively well commented and easier to trace through.