I have a fixed-size 2D matrix with size W x H, each element in the matrix is a std::vector. The data is stored in vector of vectors with linearized index. I'm trying to find a way to concurrently fill the output vector. Here is some code to indicate what I'm trying to do.
#include <cmath>
#include <chrono>
#include <iostream>
#include <mutex>
#include <vector>
#include <omp.h>
struct Vector2d
{
double x;
double y;
};
double generate(double range_min, double range_max)
{
double val = (double)rand() / RAND_MAX;
return range_min + val * (range_max - range_min);
}
int main(int argc, char** argv)
{
(void)argc;
(void)argv;
// generate input data
std::vector<Vector2d> points;
size_t num = 10000000;
size_t w = 100;
size_t h = 100;
for (size_t i = 0; i < num; ++i)
{
Vector2d point;
point.x = generate(0, w);
point.y = generate(0, h);
points.push_back(point);
}
// output
std::vector<std::vector<Vector2d> > output(num, std::vector<Vector2d>());
std::mutex mutex;
auto start = std::chrono::system_clock::now();
#pragma omp parallel for
for (size_t i = 0; i < num; ++i)
{
const Vector2d point = points[i];
size_t x = std::floor(point.x);
size_t y = std::floor(point.y);
size_t id = y * w + x;
mutex.lock();
output[id].push_back(point);
mutex.unlock();
}
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n";
return 0;
}
The problem is the code is much slower with openmp enabled. I found some example to fill std::vector using reduction, but I don't know how to adapt it to vector of vectors. Any help is appreciate, thanks!
There are some things you could do to improve the performance:
I would preallocate the second vector holding the Vector2d class, because every time you push_back a new Vector2d and the capacity of the std::vector is exceeded, it is going to reallocate. So if you do not care having initialized Vector2ds in your std::vector I would simply use:
std::vector<std::vector<Vector2d> > output(num,
std::vector<Vector2d>(num, Vector2d(/*whatever goes in here*/)));
Then in your for loop, you coul access the elements in the second vector via operator[], which allows you to get rid of the lock.
#pragma omp parallel for
for (size_t i = 0; i < num; ++i)
{
const Vector2d point = points[i];
size_t x = std::floor(point(0));
size_t y = std::floor(point(1));
size_t id = y * w + x;
output[id][i] = num;
}
Though I'm not sure, the before-mentioned way works with what you want to do. Otherwise you could reserve the storage for each std::vector<Vector2d>, which would leave you with your initial loop:
std::vector<std::vector<Vector2d> > output(num, std::vector<Vector2d>());
for(int i = 0; i < num; ++i) {
output[i].reserve(num);
}
#pragma omp parallel for
for (size_t i = 0; i < num; ++i)
{
const Vector2d point = points[i];
size_t x = std::floor(point(0));
size_t y = std::floor(point(1));
size_t id = y * w + x;
mutex.lock();
output[id].push_back(point);
mutex.unlock();
}
Which means you get rid of the vector re-allocation, but you still have the mutex...
Related
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 1 year ago.
Improve this question
I am working on a project about multithreading. Here Operation is a class which contains a type, a key, a time and an answer.
Here is my code:
#include <cstdlib>
#include <fstream>
#include <string>
#include <iomanip>
#include <pthread.h>
#include <vector>
#include "block.h"
using namespace std;
std::vector<Operation> *data;
block_bloom_filter filter(10000000, 0.01);
int ans[30000000];
void *test(void *arg)
{
int thread_id = *((int *)arg);
for (auto &op : data[thread_id])
{
if (op.type == 1)
{
filter.insert(op);
}
else
{
filter.query(op);
}
}
return 0;
}
int main(int argc, char **argv)
{
int k = atoi(argv[1]);
int *op_num = new int[k];
data = new vector<Operation>[k];
for (int i = 0; i < k; i++)
{
string tmp = "data" + to_string(i + 1) + ".in";
const char *s = tmp.c_str();
ifstream fin;
fin.open(s);
fin >> op_num[i];
//data[i] = new Operation[op_num[i]];
for (int j = 0; j < op_num[i]; j++)
{
string tmp1;
fin >> tmp1;
if (tmp1 == "insert")
{
Operation tmp2;
tmp2.type = 1;
fin >> tmp2.key >> tmp2.time;
tmp2.ans = -1;
data[i].push_back(tmp2);
}
else
{
Operation tmp2;
tmp2.type = 2;
fin >> tmp2.key >> tmp2.time;
tmp2.ans = -1;
data[i].push_back(tmp2);
}
}
fin.close();
}
auto start = std::chrono::high_resolution_clock::now();
int num_threads = k;
pthread_t *threads = new pthread_t[num_threads];
//auto **threads = new thread *[num_threads];
//pthread_t *threads = new pthread_t[k];
/*for (int i = 0; i < num_threads; i++)
{
threads[i] = new thread(test, i);
}
for (int i = 0; i < num_threads; i++)
{
threads[i]->join();
}*/
for (int i = 0; i < k; i++)
{
pthread_create(&threads[i], NULL, test, (void *)&(i));
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
//std::cerr << "duration = " << duration.count() << "us" << std::endl;
double time_used = duration.count() / 1e3;
std::ofstream f_time("time.out");
f_time << std::fixed << std::setprecision(3) << time_used << std::endl;
f_time.close();
for (int i = 0; i < k; i++)
{
for (int j = 0; j < op_num[i]; j++)
{
ans[data[i][j].time - 1] = data[i][j].ans;
}
}
ofstream fout;
fout.open("result.out");
for (int i = 0; i < 30000000; i++)
{
if (ans[i] >= 0)
fout << ans[i] << endl;
}
fout.close();
delete[] data;
delete[] threads;
delete[] op_num;
//pthread_exit(NULL);
}
My code can compile, but when running it shows segmentation fault and can only generate time.out no result.out. I've been working on it for a long time but still do not know why. Hope someone can help me.
Below is block.h
#include <algorithm>
#include <chrono>
#include <cmath>
#include <ctime>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "Headers/MurmurHash3.h"
#include "xxHash/xxhash.c"
#define M_LN2 0.69314718055994530942
using namespace std;
typedef std::vector<bool> bit_vector;
class Operation
{
public:
int type; // 1: insert, 2: query
char key[17];
int time;
int ans;
};
int str_len = 16;
int cache_size = 64;
int block_size = 512;
int key_num = 10000000;
int slot_num = 1 << 27;
int hash_num = int((double)slot_num / key_num * M_LN2);
int block_num = (slot_num + block_size - 1) / block_size;
class bloom_filter
{
uint32_t size; // Probable Number of elements in universe
double fpr; // False positive rate
int m; // optimal size of bloom filter
int k; // Number of hash functions
bit_vector bloom;
public:
int get_size() { return size; }
double get_fpr() { return fpr; }
bloom_filter(int n, double fpr)
{
this->size = n;
this->fpr = fpr;
this->m = ceil(
-((n * log(fpr)) /
pow(log(2), 2.0))); // Natural logarithm m = −n ln p/(ln 2)2
// cout << m<< "\n";
this->k = ceil(
(m / n) * log(2)); // Calculate k k = (m/n) ln 2 2-k ≈ 0.6185 m/n
// cout << k;
bloom.resize(m, false);
}
void insert(string S)
{
uint32_t *p = new uint32_t(1); // For storing Hash Vaue
const void *str = S.c_str(); // Convert string to C string to use as a
// parameter for constant void
int index;
// cout<<S.length()<<"\t"<<sizeof(str)<<"\n";
// cout<<S<<"\n";
for (int i = 0; i < k; i++)
{
// MurmurHash3_x64_128();
MurmurHash3_x86_32(str, S.length(), i + 1,
p); // String, String size
index = *p % m;
// cout<<*p<<"\t"<<index<<"\t";
bloom[index] = true;
}
// cout<<"\n";
// print();
}
/*void print()
{
for (int i = 0; i < bloom.size(); i++)
{
cout << bloom.at(i);
}
}*/
char query(string S)
{
uint32_t *p = new uint32_t(1); // For storing Hash Vaue
const void *str = S.c_str(); // Convert string to C string to use as a
// parameter for constant void
int index;
// cout << S.length() << "\t" << sizeof(str) << "\n";
// cout<<S<<"\n";
for (int i = 0; i < k; i++)
{
// MurmurHash3_x64_128();
MurmurHash3_x86_32(str, S.length(), i + 1,
p); // String, String size
index = *p % m;
// cout<<*p<<"\t"<<index<<"\t";
if (bloom[index] == false)
return 'N';
}
return 'Y';
}
};
class block_bloom_filter
{
int size; // Probable Number of elements in universe
double fpr; // False positive rate
int m; // optimal size of bloom filter
int k; // Number of hash functions
int s; // Number of bloom filters
bit_vector block_bloom;
int cache_line_size;
public:
int get_size() { return size; }
double get_fpr() { return fpr; }
block_bloom_filter(int n, double fpr)
{
this->size = n;
this->fpr = fpr;
this->m = ceil(
-((n * log(fpr)) /
pow(log(2), 2.0))); // Natural logarithm m = −n ln p/(ln 2)2
// cout << m << "\n";
this->k = ceil(
(m / n) * log(2)); // Calculate k k = (m/n) ln 2 2-k ≈ 0.6185 m/n
// cout << k<<"\n";
this->cache_line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE) * 8;
this->s =
ceil((double)m / cache_line_size); // Total number of Bloom Filters
// cout<<s<<"s valye\n";
block_bloom.resize(cache_line_size * s, false);
}
/*void insert(Operation &S)
{
int block_number;
int first_index, last_index;
int index;
uint32_t *p = new uint32_t(1); // For storing Hash Value
const void *str = S.key.c_str(); // Convert string to C string to use as a
// parameter for constant void
MurmurHash3_x86_32(str, sizeof(str), 1,
p); // String, String size//Find out block number
// if(s!=0)
block_number = *p % s;
first_index = block_number * cache_line_size;
for (int i = 1; i < k; i++)
{
// MurmurHash3_x64_128();
MurmurHash3_x86_32(str, S.key.length(), i + 1,
p); // String, String size
// cout<<*p<<"\n";
// cout<<"div="<<div << "\n";
index = (*p) % cache_line_size;
// cout<<index<<"\t";
// if(index>m) cout<<"\n"<<index<<"\tError detected\n";
// cout<<"\n"<<index<<"a\t\n";
// cout<<"\n"<<first_index<<"a\t\n";
// cout<<(index+first_index)<<"a\t\n";
block_bloom[index + first_index] = true;
}
// cout<<"\n";
// print();
}*/
XXH64_hash_t GetHash(const char *str)
{
return XXH3_64bits_withSeed(str, 16, /* Seed */ 123976235672331983ll);
}
void insert(Operation &s)
{
XXH64_hash_t hash = GetHash(s.key);
XXH64_hash_t hash1 = hash % m;
XXH64_hash_t hash2 = (hash / m) % m;
for (int i = 0; i < k; i++)
{
int pos = (hash1 + i * hash2) % m;
block_bloom[pos] = 1;
}
}
void query(Operation &s)
{
XXH64_hash_t hash = GetHash(s.key);
XXH64_hash_t hash1 = hash % m;
XXH64_hash_t hash2 = (hash / m) % m;
for (int i = 0; i < k; i++)
{
int pos = (hash1 + i * hash2) % m;
if (!block_bloom[pos])
{
s.ans = 0;
return;
}
}
s.ans = 1;
return;
}
};
for (int i = 0; i < k; i++)
{
pthread_create(&threads[i], NULL, test, (void *)&(i));
The third parameter to pthread_create(), the thread function's parameter, is a pointer to the loop variable. The thread function reads it, as follows:
void *test(void *arg)
{
int thread_id = *((int *)arg);
There are no guarantees whatsoever that this gets executed by the new execution thread before the parent execution thread increments i. When it comes to multiple execution threads, neither POSIX nor the C++ library gives you any guarantees as to the relative execution order of multiple threads.
All that pthread_create() guarantees you is that at some point in time later, which can before before or after pthread_create() returns, the new execution thread pops into existence and begins executing the thread function.
And it may very well be that one or more (if not all) execution threads finally begin executing, for real, after the for loop terminates and i gets destroyed. At which pointL when they do start executing, they will discover a pointer to a destroyed variable as their argument, and dereferencing it becomes undefined behavior.
Or, some of those execution threads get their gear running, at some point after they get created. By this time i's been incremented a couple of times already. So they both read the *(int *)arg, whose value is now -- who knows? And, just to make things interesting, both execution threads do this at the same time, and read the same value. At this point, the end result is already going to be garbage. It is clear that the intent here is for each execution thread getting a unique value for its parameter, but this very unlikely to happen here. There's nothing in the shown code that ensures that each execution threads actually gets its own unique thread_id.
Additionally, the original parent execution thread seems to assume that all the execution threads will all finish their job before the parent execution thread reads their results, and writes them out to a file.
Unfortunately, there's no code in the parent execution thread that appears to actually wait for all execution threads to finish. As soon as they're all started, it takes it on faith that they complete instantly, and it reads the partial results, and writes it out to a file:
auto stop = std::chrono::high_resolution_clock::now();
Well, the bad news here is that there's nothing that actually waits for all execution threads to actually stop, at this point. They're still running here. Even if the program manages to avoid crashing, the output results will be incomplete, and mostly junk.
ans[data[i][j].time - 1]
It appears that the value of .time here was originally read from the input file. There does not appear to be any bounds checking here. It's possible for this vector/array access to be out of bounds, resulting in an undefined behavior and a likely crash.
Also, another problem with the shown code: There are plenty of calls to new, but only some of those get deleted, resulting in multiple memory leaks. Inspecting the shown code, there is no clear reason to new anything, in the first place.
In conclusion, there are multiple problems with the shown code that result in undefined behavior, and any of them will be the reason for the observed crash. The shown approach is very much error-prone, and will require much more substantial work, and proper multi-threading support, and inter-thread sequencing, in order to get the sequence of all events happen in the correct order, across all the execution threads.
I am trying to add multi-threading in a C++ code. The target is the for loop inside the function. The objective is to reduce the execution time of the program. It takes 3.83 seconds for execution.
I have tried to add the command #pragma omp parallel for reduction(+:sum) in the inner loop (before the j for-loop) but it was not enough. It took 1.98 seconds. The aim is to decrease the time up to 0.5 seconds.
I made some research to increase the speed up and some people recommend the Strip Mining method for Vectorization for better results. However I do not know how to implement it yet.
Could someone know how to do it ?
The code is:
void filter(const long n, const long m, float *data, const float threshold, std::vector &result_row_ind) {
for (long i = 0; i < n; i++) {
float sum = 0.0f;
for (long j = 0; j < m; j++) {
sum += data[i*m + j];
}
if (sum > threshold)
result_row_ind.push_back(i);
}
std::sort(result_row_ind.begin(),
result_row_ind.end());
}
Thank you very much
When possible, you likely want to parallelize the outer loop. The simplest way to go about this in OpenMP is to do this:
#pragma omp parallel for
for (long i = 0; i < n; i++) {
float sum = 0.0f;
for (long j = 0; j < m; j++) {
sum += data[i*m + j];
}
if (sum > threshold) {
#pragma omp critical
result_row_ind.push_back(i);
}
}
std::sort(result_row_ind.begin(),
result_row_ind.end());
This works, and is probably a great deal faster than parallelizing the inner loop (launching a parallel region is expensive), but it uses a critical section for locking to prevent races. The race could also be avoided by using a user defined reduction over vectors with a reduction on that loop, if the number of threads is very large and the number of matching results is very small this might be slower, but otherwise it is likely notably faster. This is not quite right, the vector type is incomplete since it wasn't listed, but should be pretty close:
#pragma omp declare \
reduction(CatVec: std::vector<T>: \
omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
initializer(omp_priv=std::vector<T>())
#pragma omp parallel for reduction(CatVec: result_row_ind)
for (long i = 0; i < n; i++) {
float sum = 0.0f;
for (long j = 0; j < m; j++) {
sum += data[i*m + j];
}
if (sum > threshold) {
result_row_ind.push_back(i);
}
}
std::sort(result_row_ind.begin(),
result_row_ind.end());
If you have a C++ compiler with support for execution policies, you could try std::for_each with the execution policy std::execution::par to see if that helps. Example:
#include <iostream>
#include <vector>
#include <algorithm>
#if __has_include(<execution>)
# include <execution>
#elif __has_include(<experimental/execution_policy>)
# include <experimental/execution_policy>
#endif
// iterator to use with std::for_each
class iterator {
size_t val;
public:
using iterator_category = std::forward_iterator_tag;
using value_type = size_t;
using difference_type = size_t;
using pointer = size_t*;
using reference = size_t&;
iterator(size_t value=0) : val(value) {}
inline iterator& operator++() { ++val; return *this; }
inline bool operator!=(const iterator& rhs) const { return val != rhs.val; }
inline reference operator*() { return val; }
};
std::vector<size_t> filter(const size_t rows, const size_t cols, const float* data, const float threshold) {
std::vector<size_t> result_row_ind;
std::vector<float> sums(rows);
iterator begin(0);
iterator end(rows);
std::for_each(std::execution::par, begin, end, [&](const size_t& row) {
const float* dataend = data + (row+1) * cols;
float& sum = sums[row];
for (const float* dataptr = data + row * cols; dataptr < dataend; ++dataptr) {
sum += *dataptr;
}
});
// pushing moved outside the threaded code to avoid using mutexes
for (size_t row = 0; row < rows; ++row) {
if (sums[row] > threshold)
result_row_ind.push_back(row);
}
std::sort(result_row_ind.begin(),
result_row_ind.end());
return result_row_ind;
}
int main() {
constexpr size_t rows = 1<<15, cols = 1<<18;
float* data = new float[rows*cols];
for (int i = 0; i < rows*cols; ++i) data[i] = (float)i / (float)100000000.;
std::vector<size_t> res = filter(rows, cols, data, 10.);
std::cout << res.size() << "\n";
delete[] data;
}
I wanted to learn how threads work, and I tried to make a program, which would use 2 threads, to copy a picture (just to test my newly acquired threading skills) . But I bumped into an error, probably because my interval (created by the interval function) is only working ( I believe) with one dimensional arrays.How can I change my program , to correctly create intervals , which work on 2 dimensional arrays, such as pictures ?
#include <iostream>
#include <vector>
#include <time.h>
#include <thread>
#include <mutex>
#include <png++/png.hpp>
std::mutex my_mutex;
std::vector<int> interval(int max, int n_threads)
{
std::vector<int> intervallum;
int ugras = max / n_threads;
int maradek = max % n_threads;
int n1 = 0;
int n2;
intervallum.push_back(n1);
for (int i = 0; i < n_threads; i++)
{
n2 = n1 + ugras;
if (i == n_threads - 1)
n2 += maradek;
intervallum.push_back(n2);
n1 = n2;
}
return intervallum;
}
void create_image(png::image<png::rgb_pixel> image, png::image<png::rgb_pixel> new_image, int start, int end)
{
std::lock_guard<std::mutex> lock(my_mutex);
for (int i = start; i < end; i++)
for (int j = start; j < end; j++)
{
new_image[i][j].red = image[i][j].red;
new_image[i][j].blue = image[i][j].blue;
new_image[i][j].green = image[i][j].green;
}
}
int main()
{
png::image<png::rgb_pixel> png_image("mandel.png");
int image_size = png_image.get_width() * png_image.get_height();
png::image<png::rgb_pixel> new_image(png_image.get_width(), png_image.get_height());
time_t start, end;
time(&start);
int size = 2;
std::vector<std::thread> threads;
std::vector<int> stuff_interval = interval(image_size, size);
for (int i = 0; i < size-1; i++)
threads.push_back(std::thread(create_image, std::ref(png_image), std::ref(new_image), stuff_interval[i], stuff_interval[i + 1]));
for (auto& i : threads)
i.join();
create_image(png_image,new_image,stuff_interval[size-2],stuff_interval[size-1]);
new_image.write("test.png");
time(&end);
std::cout << (start - end) << std::endl;
return 0;
}
Okay , I found a way around it (this way I am not getting segmentation error, but it does not copy the image correctly, the new image is fully black, here is the code :
EDIT : seems like, I was passing wrong the pictures, that is the reason why the picture was black.
#include <iostream>
#include <vector>
#include <time.h>
#include <thread>
#include <mutex>
#include <png++/png.hpp>
std::mutex my_mutex;
std::vector<int> interval(int max, int n_threads)
{
std::vector<int> intervallum;
int ugras = max / n_threads;
int maradek = max % n_threads;
int n1 = 0;
int n2;
intervallum.push_back(n1);
for (int i = 0; i < n_threads; i++)
{
n2 = n1 + ugras;
if (i == n_threads - 1)
n2 += maradek;
intervallum.push_back(n2);
n1 = n2;
}
return intervallum;
}
void create_image(png::image<png::rgb_pixel>& image, png::image<png::rgb_pixel>& new_image, int start, int end)
{
std::lock_guard<std::mutex> lock(my_mutex);
for (int i = start; i < end; i++)
for (int j = 0; j < image.get_height(); j++)
{
new_image[i][j].red = image[i][j].red;
new_image[i][j].blue = image[i][j].blue;
new_image[i][j].green = image[i][j].green;
}
}
int main()
{
png::image<png::rgb_pixel> png_image("mandel.png");
int image_size = png_image.get_width() * png_image.get_height();
png::image<png::rgb_pixel> new_image(png_image.get_width(), png_image.get_height());
time_t start, end;
time(&start);
int size = 2;
std::vector<std::thread> threads;
std::vector<int> stuff_interval = interval(png_image.get_width(), size);
new_image.write("test2.png");
for (int i = 0; i < size - 1; i++)
threads.push_back(std::thread(create_image, std::ref(png_image), std::ref(new_image), stuff_interval[i], stuff_interval[i + 1]));
for (auto &i : threads)
i.join();
create_image(std::ref(png_image), std::ref(new_image), stuff_interval[size - 1], stuff_interval[size]);
new_image.write("test.png");
time(&end);
std::cout << (start - end) << std::endl;
return 0;
}
I have a program that computes the matrix product x'Ay repeatedly. Is it better practice to compute this by making calls to MKL's blas, i.e. cblas_dgemv and cblas_ddot, which requires allocating memory to a temporary vector, or is better to simply take the sum of x_i * a_ij * y_j? In other words, does MKL's blas theoretically add any value?
I benchmarked this for my laptop. There was virtually no difference in each of the tests, other than g++_no_blas performed twice as poorly as the other tests (why?). There was also no difference between O2, O3 and Ofast.
g++_blas_static 57ms
g++_blas_dynamic 58ms
g++_no_blas 100ms
icpc_blas_static 57ms
icpc_blas_dynamic 58ms
icpc_no_blas 58ms
util.h
#ifndef UTIL_H
#define UTIL_H
#include <random>
#include <memory>
#include <iostream>
struct rng
{
rng() : unif(0.0, 1.0)
{
}
std::default_random_engine re;
std::uniform_real_distribution<double> unif;
double rand_double()
{
return unif(re);
}
std::unique_ptr<double[]> generate_square_matrix(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N * N]);
for (unsigned i = 0; i < N; ++i)
{
for (unsigned j = 0; j < N; ++j)
{
p.get()[i*N + j] = rand_double();
}
}
return p;
}
std::unique_ptr<double[]> generate_vector(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N]);
for (unsigned i = 0; i < N; ++i)
{
p.get()[i] = rand_double();
}
return p;
}
};
#endif // UTIL_H
main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>
#include "util.h"
#include "mkl.h"
double vtmv_blas(double* x, double* A, double* y, const unsigned n)
{
double temp[n];
cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, A, n, y, 1, 0.0, temp, 1);
return cblas_ddot(n, temp, 1, x, 1);
}
double vtmv_non_blas(double* x, double* A, double* y, const unsigned n)
{
double r = 0;
for (unsigned i = 0; i < n; ++i)
{
for (unsigned j = 0; j < n; ++j)
{
r += x[i] * A[i*n + j] * y[j];
}
}
return r;
}
int main()
{
std::cout << std::fixed;
std::cout << std::setprecision(2);
constexpr unsigned N = 10000;
rng r;
std::unique_ptr<double[]> A = r.generate_square_matrix(N);
std::unique_ptr<double[]> x = r.generate_vector(N);
std::unique_ptr<double[]> y = r.generate_vector(N);
auto start = std::chrono::system_clock::now();
const double prod = vtmv_blas(x.get(), A.get(), y.get(), N);
auto end = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start);
std::cout << "Result: " << prod << std::endl;
std::cout << "Time (ms): " << duration.count() << std::endl;
GCC no blas is poor because it does not use vectorized SMID instructions, while others all do. icpc will auto-vectorize you loop.
You don't show your matrix size, but generally gemv is memory bound. As the matrix is much larger than a temp vector, eliminating it may not be able to increase the performance a lot.
I have a vector of objects obj (of class Holder) with N elements with members like x and y which are also vectors of double type with M elements. I would like to write a text file creating an MxN matrix from this. I have tried lots of different things to no avail up to now.
vector<Holder> obj(N);
void savedata(string filename, vector<Holder> obj, int M, int N) {
ofstream out(filename);
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
out << obj[i][j] << "\t" << endl;
}
}
}
But this just takes the last set of values. How can I create such an MxN matrix where rows are from the object member vector x and columns are from the object vector itself?
Thank you in advance.
--
The bigger version of the code is as follows:
//
//
#include <iostream>
#include <cmath>
#include <fstream>
#include <string>
#include <vector>
#include <random>
using namespace std;
typedef vector< vector<double> > Matrix;
// Particles making up the cell
class Particle{
public:
double x; // x position
double y; // y position
double vx; // velocity in the x direction
double vy; // velocity in the y direction
double Fx; // force in the x direction
double Fy; // force in the y direction
// Default constructor
Particle()
: x(0.0),y(0.0),vx(0.0),vy(0.0),Fx(0.0),Fy(0.0){
}
};
// Holder for storing data
class HoldPar{
public:
vector<double> x;
vector<double> y;
vector<double> vx;
vector<double> vy;
// Default constructor
HoldPar()
: x(0.0),y(0.0),vx(0.0),vy(0.0){
}
// Add elements to vectors
void add_Xelement(double a) {
x.push_back(a);
}
void add_Yelement(double a) {
y.push_back(a);
}
void add_VXelement(double a) {
vx.push_back(a);
}
void add_VYelement(double a) {
vy.push_back(a);
}
};
int main() {
// Initialization of x, v and F
const float pi = 3.14;
int N = 30; // Number of 'particles' that make up the cell
float theta = 2*pi/N; // Angle between two particles in radians
float x0 = 0; // Center of the cell [x]
float y0 = 0; // Center of the cell [y]
float R = 5e-6; // Radius of the cell
vector<Particle> particles(N); // particles
// Assigning the initial points onto the circle
for(int i = 0; i < N; i++) {
particles[i].x = x0 + R*cos(theta*i);
particles[i].y = y0 + R*sin(theta*i);
}
float k = 4.3e-7; // Spring constant connecting the particles
float m = 2e-8; // Mass of the particles
// Calculating the initial spring force between the particles on the cell
particles[0].Fx = -k*(particles[1].x - particles[N].x);
particles[0].Fy = -k*(particles[1].y - particles[N].y);
for(int i = 1; i < N-1; i++) {
particles[i].Fx = -k*(particles[i+1].x - particles[i-1].x);
particles[i].Fy = -k*(particles[i+1].y - particles[i-1].y);
}
particles[N].Fx = -k*(particles[0].x - particles[N-1].x);
particles[N].Fy = -k*(particles[0].y - particles[N-1].y);
// Initial velocities are given to each particle randomly from a Gaussian distribution
random_device rdx; // Seed
default_random_engine generatorx(rdx()); // Default random number generator
random_device rdy; // Seed
default_random_engine generatory(rdy()); // Default random number generator
normal_distribution<float> distributionx(0,1); // Gaussian distribution with 0 mean and 1 variance
normal_distribution<float> distributiony(0,1); // Gaussian distribution with 0 mean and 1 variance
for(int i = 0; i < N; i++) {
float xnumber = distributionx(generatorx);
float ynumber = distributiony(generatory);
particles[i].vx = xnumber;
particles[i].vy = ynumber;
}
// Molecular dynamics simulation with velocity Verlet algorithm
// 'Old' variables
vector<Particle> particles_old(N);
for(int i = 0; i < N; i++) {
particles_old[i].x = particles[i].x;
particles_old[i].y = particles[i].y;
particles_old[i].vx = particles[i].vx;
particles_old[i].vy = particles[i].vy;
particles_old[i].Fx = particles[i].Fx;
particles_old[i].Fy = particles[i].Fy;
}
// Sampling variables
int sampleFreq = 2;
int sampleCounter = 0;
// MD variables
float dt = 1e-4;
float dt2 = dt*dt;
float m2 = 2*m;
int MdS = 1e+5; // Molecular dynamics step number
// Holder variables
vector<HoldPar> particles_hold(N);
// MD
for(int j = 0; j < MdS; j++) {
// Update x
for(int i = 0; i < N; i++) {
particles[i].x = particles_old[i].x + dt*particles_old[i].vx + dt2*particles_old[i].Fx/m2;
particles[i].y = particles_old[i].y + dt*particles_old[i].vy + dt2*particles_old[i].Fy/m2;
}
// Update F
particles[0].Fx = -k*(particles[1].x - particles[N].x);
particles[0].Fy = -k*(particles[1].y - particles[N].y);
for(int i = 1; i < N-1; i++) {
particles[i].Fx = -k*(particles[i+1].x - particles[i-1].x);
particles[i].Fy = -k*(particles[i+1].y - particles[i-1].y);
}
particles[N].Fx = -k*(particles[0].x - particles[N-1].x);
particles[N].Fy = -k*(particles[0].y - particles[N-1].y);
// Update v
for(int i = 0; i < N; i++) {
particles[i].vx = particles_old[i].vx + dt*(particles_old[i].Fx + particles[i].Fx)/m2;
particles[i].vy = particles_old[i].vy + dt*(particles_old[i].Fy + particles[i].Fy)/m2;
}
// Copy new variables to old variables
for(int i = 0; i < N; i++) {
particles_old[i].x = particles[i].x;
particles_old[i].y = particles[i].y;
particles_old[i].vx = particles[i].vx;
particles_old[i].vy = particles[i].vy;
particles_old[i].Fx = particles[i].Fx;
particles_old[i].Fy = particles[i].Fy;
}
// Store variables
if(j % sampleFreq == 0) {
for(int i = 0; i < N; i++) {
particles_hold[i].add_Xelement( particles[i].x );
particles_hold[i].add_Yelement( particles[i].y );
particles_hold[i].add_VXelement( particles[i].vx );
particles_hold[i].add_VYelement( particles[i].vy );
}
sampleCounter += 1;
}
}
//* End of molecular dynamics simulation
}
//
//*
//
Essentially I'm trying to write a txt file where particles_hold elements (from 1 to N) are columns and members of particles_hold elements like x (from 1 to some value M) are rows.
If you mean visually then the way is put endl or "\n" to the outer loop and remove endl from inner loop.But i do not know anythig about your Holder object and if you have [] operator defined there that is the answer.
vector<Holder> obj(N);
void savedata(string filename, vector<Holder> obj, int M, int N) {
ofstream out(filename);
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
out << obj[i][j] << "\t";
}
out<< "\n";
}
}
Your method is ok, however, made some minor change so that you have M lines, each lines represent obj[i], i = 0.. M-1. So, each column (jth index) is printed as tab separated in each line
vector<Holder> obj(N);
void savedata(string filename, vector<Holder> obj, int M, int N) {
ofstream out(filename);
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
out << obj[i][j] << "\t";
}
out << endl;
}
}