I have the following code, where I am using both the Eigen and FFTW libraries in C++. I am aware of the unsupported FFT module that Eigen has, but I am using the FFTW library for its additional features. So, I am trying to take the FFT of a C++ array and then map the output to a complex Eigen matrix (to perform some linear algebra operations on it). I have the code:
struct cplx_buffer
{
fftw_complex* a;
int rows;
int cols;
fftw_complex& operator()(int i, int j) const { return a[i * cols + j]; }
};
struct real_buffer
{
double* a;
int rows;
int cols;
double& operator()(int i, int j) const { return a[i * cols + j]; }
};
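(The allocation helpers used below are not shown in the question; a hypothetical reconstruction, assuming they simply wrap FFTW's fftw_alloc_real/fftw_alloc_complex, might look like this:)
// Hypothetical reconstruction of the helpers referenced below; not part of
// the original question.
cplx_buffer my_fftw_allocate_cplx(int rows, int cols)
{
    cplx_buffer b;
    b.a = fftw_alloc_complex(static_cast<size_t>(rows) * cols);
    b.rows = rows;
    b.cols = cols;
    return b;
}
real_buffer my_fftw_allocate_real(int rows, int cols)
{
    real_buffer b;
    b.a = fftw_alloc_real(static_cast<size_t>(rows) * cols);
    b.rows = rows;
    b.cols = cols;
    return b;
}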
int main(){
static const int nx = 10;
static const int ny = 10;
static const int nyk = ny/2 + 1;
static const int mm = nx* 3/2;
cplx_buffer outW = my_fftw_allocate_cplx((ny+1), mm);
real_buffer inW = my_fftw_allocate_real((ny+1), mm);
//initialize the input
for (int i = 0; i < ny+1; i++){
for (int j = 0; j < mm; j++){
inW(i,j) = //expression
}
}
//Take FFT of rows
{ // Transform all the rows
fftw_execute(fftw_plan_many_dft_r2c(1, &nx, inW.rows, inW.a, &inW.cols, 1, inW.cols, outW.a, &inW.cols, 1, outW.cols, FFTW_ESTIMATE));
}
Eigen::Map<Eigen::MatrixXcd, Eigen::Unaligned> invnek(*reinterpret_cast<fftw_complex*>(&outW),(ny+1),mm); //ERROR
}
The error:
no instance of constructor "Eigen::Map<PlainObjectType, MapOptions, StrideType>::Map [with PlainObjectType=Eigen::MatrixXcd, MapOptions=0, StrideType=Eigen::Stride<0, 0>]" matches the argument list
How can I reinterpret_cast here correctly?
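For reference, here is a minimal sketch of a Map construction that should compile, assuming FFTW's documented guarantee that fftw_complex (a double[2]) is bit-compatible with std::complex<double>. The two key points are that Map needs the buffer's data pointer (outW.a, not &outW), and that the pointer type must be std::complex<double>*, which is MatrixXcd's scalar type:
// Sketch: map the FFTW output buffer as an Eigen matrix without copying.
Eigen::Map<Eigen::MatrixXcd, Eigen::Unaligned> invnek(
    reinterpret_cast<std::complex<double>*>(outW.a),  // data pointer, not &outW
    ny + 1, mm);
// Caveat: MatrixXcd is column-major by default, while outW is stored
// row-major; to make invnek(i,j) line up with outW(i,j), map to a RowMajor
// matrix type or supply an explicit Eigen::Stride instead.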
Does accessing different elements of the same array create a data race?
I have a "Matrix" wrapper class that gives an array a matrix interface, and I wrote a parallel scalar multiplication function for it.
I use the CTPL library for thread pools.
I know that writing from a thread into an array cell passed by reference is not a data race, since distinct elements are distinct memory locations (please correct me if I'm wrong). So I decided to pass a cell of the array to the function and write the multiplication result into the cell itself, rather than passing a reference to the array plus an index, in order to avoid a data race.
I ran the function 10k times and the results did not differ even once, but the sanitizer I use ("-fsanitize=thread -fPIE -pie -g" in my CMake flags) still reports a data race on the line where I create the thread pool.
Is the sanitizer mistaken, or am I really experiencing a data race somewhere?
Here are the pieces of code relevant to the problem:
Wrapper:
class Matrix {
protected:
int width;
int height;
double* matrix;
public:
Matrix(int m, int n);
Matrix(int m, int n, const std::vector<double>& values);
int get_width() {
return width;
}
int get_height() {
return height;
}
double get_element(int row_num, int col_num);
void set_element(int row_num, int col_num, double el);
double* get_cell_ref(int row_num, int col_num);
};
Method implementations:
Matrix::Matrix(int m, int n) {
assert(m > 0 && n > 0);
matrix = new double[m * n]{0};
width = n;
height = m;
}
Matrix::Matrix(int m, int n, const std::vector<double>& values) {
assert(m > 0 && n > 0 && values.size() == m * n);
matrix = new double[m * n];
width = n;
height = m;
for (int i = 0; i < m * n; ++i) {
matrix[i] = values[i];
}
}
double Matrix::get_element(int row_num, int col_num) {
assert(check_valid(row_num, col_num, get_width(), get_height()));
return matrix[col_num + get_width() * row_num];
}
void Matrix::set_element(int row_num, int col_num, double el) {
assert(check_valid(row_num, col_num, get_width(), get_height()));
matrix[col_num + row_num * get_width()] = el;
}
double* Matrix::get_cell_ref(int row_num, int col_num) {
int idx = col_num + get_width() * row_num;
return &matrix[idx];
}
The function that supposedly has a data race:
Matrix* scalar_multiply_parallel(Matrix* a, double mul, int threadN) {
auto* b = new Matrix(a->get_height(), a->get_width());
ctpl::thread_pool thr_pool(threadN);
std::vector<std::future<void>> futures(a->get_height() * a->get_width());
for (int i = 0; i < a->get_height(); i++) {
for (int j = 0; j < a->get_width(); j++) {
int idx = j + a->get_width() * i;
auto util = [&a, &b, i, j, mul](int) {
//b->set_element(i, j, a->get_element(i, j) * mul);
double *cell;
cell = b->get_cell_ref(i, j);
*cell = a->get_element(i, j) * mul;
};
futures[idx] = thr_pool.push(util);
}
}
for (auto& f: futures) {
f.get();
}
return b;
}
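For what it's worth, one way to localize this is to reproduce the same per-cell writes with plain std::thread and no thread-pool library: concurrent writes to distinct elements of an array are not a data race under the C++ memory model (they are different memory locations), and join() provides the happens-before needed for the final reads. Below is a minimal sketch under that assumption, reusing the Matrix class above; if TSan is quiet here but still flags the CTPL version, the report most likely concerns the pool's internal queue (TSan is known to report races in lock-free code it cannot instrument, such as the Boost lock-free queue used by CTPL's non-STL header) rather than your cell writes.
#include <thread>
#include <vector>
Matrix* scalar_multiply_threads(Matrix* a, double mul, int threadN) {
    auto* b = new Matrix(a->get_height(), a->get_width());
    const int h = a->get_height();
    const int w = a->get_width();
    std::vector<std::thread> workers;
    for (int t = 0; t < threadN; ++t) {
        // Each thread takes a disjoint stripe of rows, so no two threads
        // ever write the same element of b.
        workers.emplace_back([=] {
            for (int i = t; i < h; i += threadN)
                for (int j = 0; j < w; ++j)
                    *b->get_cell_ref(i, j) = a->get_element(i, j) * mul;
        });
    }
    for (auto& th : workers) th.join();  // happens-before the caller's reads of b
    return b;
}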
I have been working on a radix select in CUDA which finds the k smallest of a given number of elements. The main idea behind this radix select is that it scans through the 32-bit integers starting from the MSB down to the LSB. It partitions all elements with a 0 at the current bit to the left side and all elements with a 1 to the right side, and the side that contains the k smallest elements is solved recursively. For example, with the 3-bit values {5, 2, 7, 1} and k = 2, partitioning on the MSB gives a zeroes side {2, 1} and a ones side {5, 7}; the zeroes side already holds exactly two elements, so the recursion continues on {2, 1} at the next bit. My partition process works just fine, but I am having problems dealing with the recursive function calls: I am unable to stop the recursion. Please help me with that!
My kernel function looks like this (this is kernel.h):
#include "header.h"
#define WARP_SIZE 32
#define BLOCK_SIZE 32
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N / WARP_SIZE;
int pivot;
__shared__ int DataPartition[BLOCK_SIZE];
__shared__ int DataBinary[WARP_SIZE];
for(int i = 0; i < NumWarps; i++)
{
if(LocWarpID >= firstidx && LocWarpID <=lastidx)
{
int r = d_DataIn[i * WARP_SIZE + LocWarpID];
int p = (r>>(31-bit))&1;
unsigned int B = __ballot(p);
unsigned int B_flip = ~B;
if(p==1)
{
int b = B << (32-LocWarpID);
int RightLoc = __popc(b);
DataPartition[lastidx - RightLoc] = r;
}
else
{
int b_flip = B_flip << (32 - LocWarpID);
int LeftLoc = __popc(b_flip);
DataPartition[LeftLoc] = r;
}
if(LocWarpID <= lastidx - __popc(B))
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
else
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
pivot = lastidx - __popc(B);
return pivot+1;
}
}
}
__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
if(firstidx == lastidx)
return *d_DataIn;
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
int length = q - firstidx;
if(k == length)
return *d_DataIn;
else if(k < length)
return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1);
else
return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1);
}
__global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}
The host code is in main.cu and looks like this:
#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"
#define BLOCK_SIZE 32
using namespace std;
int main()
{
int N = 32;
thrust::host_vector<float>h_HostFloat(N);
thrust::counting_iterator <unsigned int> Numbers(0);
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(), RandomFloatNumbers(1.f, 100.f));
thrust::host_vector<int>h_HostInt(N);
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());
thrust::device_vector<float>d_DeviceFloat = h_HostFloat;
thrust::device_vector<int>d_DeviceInt(N);
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());
int *h_DataOut;
float *h_DataOut1;
int fsize = N * sizeof(float);
int size = N * sizeof(int);
h_DataOut = new int[N]; // N elements; size is a byte count, not an element count
h_DataOut1 = new float[N];
int firstidx = 0;
int lastidx = BLOCK_SIZE-1;
int k = 20;
int bit = 1;
int NUM_BLOCKS = N / BLOCK_SIZE;
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);
WriteData(h_DataOut1, h_DataOut, 10, N);
return 0;
}
List of headers that I used:
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/generate.h>
#include "functor.h"
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
Another header file, "functor.h", converts floating-point numbers to int and generates random floating-point numbers:
#include <thrust/random.h>
#include <sstream>
#include <fstream>
#include <iomanip>
struct RandomFloatNumbers
{
float a, b;
__host__ __device__
RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {};
__host__ __device__
float operator() (const unsigned int n) const{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a,b);
rng.discard(n);
return dist(rng);
}
};
struct FloatToInt
{
__host__ __device__
int operator() (const float &x)
const {
union {
float f_value;
int i_value;
} value;
value.f_value = x;
return value.i_value;
}
};
float IntToFloat(int &x)
{
union{
float f_value;
int i_value;
}value;
value.i_value = x;
return value.f_value;
}
bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
{
std::ofstream data;
std::stringstream file;
file << "out\\Partition_";
file << std::setfill('0') <<std::setw(2) << bit;
file << ".txt";
data.open((file.str()).c_str());
if(data.is_open() == false)
{
std::cout << "File is not open" << std::endl;
return false;
}
for(int i = 0; i < N; i++)
{
h_DataOut1[i] = IntToFloat(h_DataOut[i]);
//cout << h_HostFloat[i] << " \t" << h_DataOut1[i] << endl;
//std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
//data << bitshift[31-bit] << "\t" <<h_DataOut1[i] <<std::endl;
data << h_DataOut1[i] << std::endl;
}
data << std::endl;
data.close();
std::cout << "Partition=" <<bit <<"\n";
return true;
}
Per your request, I'm posting the code I used to investigate this problem and to help me study your code.
#include <stdio.h>
#include <stdlib.h>
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){
int one = 0;
int valid = 0;
int my_one, my_zero;
if (partition[idx]){
valid = 1;
if(data[idx] & (1ULL<<(31-bit))) one=1;}
__syncthreads();
if (valid){
if (one){
my_one=1;
my_zero=0;}
else{
my_one=0;
my_zero=1;}
}
else{
my_one=0;
my_zero=0;}
ones[idx]=my_one;
zeroes[idx]=my_zero;
unsigned int warp_one = __popc(__ballot(my_one));
if (!(threadIdx.x & 31))
warp_ones[threadIdx.x>>5] = warp_one;
__syncthreads();
// reduce
for (int i = 16; i > 0; i>>=1){
if (threadIdx.x < i)
warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i];
__syncthreads();}
return warp_ones[0];
}
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){
__shared__ unsigned int loc_data[1024];
__shared__ unsigned int loc_ones[1024];
__shared__ unsigned int loc_zeroes[1024];
__shared__ unsigned int loc_warp_ones[32];
int l=0;
int bit = 0;
unsigned int u = n;
if (n<2){
if ((n == 1) && !(threadIdx.x)) *result = data[0];
return;}
loc_data[threadIdx.x] = data[threadIdx.x];
loc_ones[threadIdx.x] = (threadIdx.x<n)?1:0;
__syncthreads();
unsigned int *next = loc_ones;
do {
int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones);
if ((u-s) > m){
u = (u-s);
next = loc_zeroes;}
else{
l = (u-s);
next = loc_ones;}}
while ((u != l) && (bit<32));
if (next[threadIdx.x]) *result = loc_data[threadIdx.x];
}
int partition(unsigned int *data, int l, int u, int bit){
unsigned int *temp = (unsigned int *)malloc(((u-l)+1)*sizeof(unsigned int));
int pos = 0;
for (int i = l; i<=u; i++)
if(data[i] & (1ULL<<(31-bit))) temp[pos++] = data[i];
int result = u-pos;
for (int i = l; i<=u; i++)
if(!(data[i] & (1ULL<<(31-bit)))) temp[pos++] = data[i];
pos = 0;
for (int i = u; i>=l; i--)
data[i] = temp[pos++];
free(temp);
return result;
}
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){
if (l == u) return(data[l]);
if (bit > 32) {printf("radixselect fail!\n"); return 0;}
int s = partition(data, l, u, bit);
if (s>=m) return radixselect(data, l, s, m, bit+1);
return radixselect(data, s+1, u, m, bit+1);
}
int main(){
unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7};
unsigned int data1[8];
for (int i = 0; i<8; i++){
for (int j=0; j<8; j++) data1[j] = data[j];
printf("value[%d] = %d\n", i, radixselect(data1, 0, 7, i, 0));}
unsigned int *d_data;
cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int));
unsigned int h_result, *d_result;
cudaMalloc((void **)&d_result, sizeof(unsigned int));
cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 8; i++){
gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
printf("gpu result index %d = %d\n", i, h_result);
}
unsigned int data2[1024];
unsigned int data3[1024];
for (int i = 0; i < 1024; i++) data2[i] = rand();
cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 1024; i++){
for (int j = 0; j<1024; j++) data3[j] = data2[j];
unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0);
gpu_radixkernel<<<1,1024>>>(d_data, i, 1024, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %d, gpu: %d\n", i, cpuresult, h_result); return 1;}
}
printf("Finished\n");
return 0;
}
Here are some notes, in no particular order:
I got rid of all your thrust code; it's not doing anything useful as far as the radix select algorithm is concerned. I also find your casting of float to int curious. I haven't thought through the ramifications of trying to do a bitwise radix select on a sequence of exponent bits followed by a sequence of mantissa bits. It might work (although I think if you include the sign bit, it definitely won't work), but again I don't think it's central to understanding the algorithm. (An order-preserving float-to-integer transform is sketched after these notes.)
I included a host version that I wrote just to check my device results.
I'm pretty sure this algorithm will fail in some cases where there are duplicated elements. For example, if you hand it a vector of all zeroes, I think it will fail. I don't think it would be difficult to handle that case however.
my host version is recursive, but my device version is not. I don't see that recursion is that useful here, since the non-recursive form of the algorithm is easy to write as well, especially since there are at most 32 bits to travel through. Still, if you wanted to create a recursive device version, it should not be difficult, by incorporating the u, s, and l manipulation code inside the partition function.
I have dispensed with typical CUDA error checking; however, I recommend it.
I don't consider this to be a paragon of CUDA programming. If you delve into a real radix sort algorithm, you will see that it is pretty complex. A fast GPU radix select would look nothing like my code. I wrote my code to be analogous to the serial recursive partitioned radix sort, which is not the best way to do it on a massively parallel architecture.
Since radix select is not a sort, I attempted to write a device code that would do no data movement of the input data, since I considered this to be expensive and unnecessary. I do a single read from global memory for the data at the beginning of the kernel, and thereafter I do all work out of shared memory, and even in shared memory I am not re-arranging the data (as I do in my host version) so as to avoid the cost of data movement. Instead I keep flag arrays of ones and zeroes partitions, to feed to the next partitioning step. The data movement would involve a fair amount of uncoalesced and/or bank-conflicted traffic, whereas the flag arrays allow all accesses to be non-bank-conflicted.
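Following up on the earlier note about casting float to int: a common order-preserving transform for IEEE-754 floats, widely used in radix sorts, is sketched below (whether it suits your use case is an assumption on my part; it requires that the data contain no NaNs). It flips every bit of a negative value and only the sign bit of a non-negative one, so that unsigned integer ordering matches float ordering; a plain bit-cast like the FloatToInt functor above does not order correctly once negative values are involved.
#include <cstring>
// Sketch: map float bits to an unsigned int whose unsigned order matches
// the float's numeric order (standard radix-sort trick; IEEE-754, no NaNs).
__host__ __device__ unsigned int float_to_ordered_bits(float f)
{
    unsigned int u;
    memcpy(&u, &f, sizeof u);  // bit-cast without aliasing trouble
    // Negative: flip all bits (raw patterns of negatives descend as the
    // value ascends). Non-negative: set the sign bit so these values land
    // above all negatives in unsigned order.
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}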
I need to copy a matrix that I have set up as M into a new matrix M2 and output that matrix.
How can this be done?
Here's what I tried so far:
#include <iostream>
using namespace std;
#define N 24
void copy(int M[][N], int M2[][N], int ROWS, int COLS)
{
int r, c;
M2[r][c]= M[r][c];
cout<< M2[r][c];
}
void print(int M[][N], int ROWS, int COLS)
{
int r, c, row, col;
row= 1;
col= 1;
M[row][col] = 2;
for(r=0; r< ROWS; r++)
{
for(c=0; c < COLS; c++)
{
if(M[r][c]==0)
{
cout<<" ";
}
else if (M[r][c]==1)
{
cout<< "T";
}
else if (M[r][c]==2)
{
cout<< "*";
}
else
{
cout << M[r][c];
}
}
cout <<endl;
}
}
void fill(int M[][N], int ROWS, int COLS, int row, int col)
{
int r, c;
for(r=0; r< ROWS; r++)
{
for(c=0; c < COLS; c++)
{
if (r == 0 || r == ROWS - 1) {
M[r][c]=0;
}
else if(c == 0 || c == COLS -1) {
M[r][c]=0;
}
else {
M[r][c]= 1;
}
}
}
}
int main()
{
int M[N/2][N];
int M2[N/2][N];
int ROWS, COLS;
int r, c;
ROWS = sizeof(M) / sizeof(M[0]);
COLS = sizeof(M[0]) / sizeof(M[0][0]);
fill(M, ROWS, COLS, 1, 1);
print(M, ROWS, COLS);
copy(M, M2, ROWS, COLS);
return 0;
}
Here is a problem:
int r, c;
M2[r][c]= M[r][c];
You never assigned r and c; they contain some unknown value, which may well be outside the ranges 0..ROWS-1 and 0..COLS-1.
Don't use uninitialized values, especially in pointer arithmetic.
To copy the entire matrix, you will probably need some loops like the ones you have in the print function.
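A minimal sketch of such a fixed copy function, keeping the signature from the question, could look like this:
void copy(int M[][N], int M2[][N], int ROWS, int COLS)
{
    int r, c;
    for (r = 0; r < ROWS; r++)
    {
        for (c = 0; c < COLS; c++)
        {
            M2[r][c] = M[r][c];  // copy element by element
        }
    }
}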
Since you are using two dimensional arrays to store the matrices, the copy of one into the other should be as simple as one call to memcpy. The contents of any array (regardless of dimension) are stored contiguously in memory.
In your copy function, just place the following line of code inside:
memcpy(M2, M, r * c * sizeof(int));
Before the memcpy, make sure you have assigned the appropriate values to r and c (these should be the correct number of rows and the correct number of columns).
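Putting that together, the whole function could look something like this sketch, using the ROWS and COLS parameters directly so there are no uninitialized variables (include <cstring> for memcpy):
void copy(int M[][N], int M2[][N], int ROWS, int COLS)
{
    // A 2-D array is one contiguous block of memory, so a single
    // memcpy copies all ROWS * COLS ints at once.
    memcpy(M2, M, ROWS * COLS * sizeof(int));
}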
Hi, I have written a program for the longest common subsequence (LCS) problem, and I am stuck with passing and traversing a 2-dimensional array. Kindly help.
Following is the piece of code.
void backtrack(char x[], char y[], int L[][7], int m, int n)
{
if(m == 0 || n == 0)
return;
else if(x[m-1] == y[n-1])
{
backtrack(x, y, L, m-1, n-1);
cout << x[m-1] << " ";
}
else
{
if(L[m-1][n] > L[m][n-1])
backtrack(x, y, L, m-1, n);
else
backtrack(x, y, L, m, n-1);
}
}
int lcs_length(char x[], char y[], const int m, const int n)
{
int L[m+1][n+1];
for(int i=0; i<=m; i++)
{
for(int j=0; j<=n; j++)
{
if(i == 0 || j == 0)
L[i][j] = 0;
else if (x[i-1] == y[j-1])
L[i][j] = L[i-1][j-1] + 1;
else
L[i][j] = max (L[i-1][j], L[i][j-1]);
}
}
backtrack(x, y, L, m+1, n+1);
return L[m][n];
}
int main(int argc, char *argv[])
{
char x[] = "ABCDGH";
char y[] = "AEDFHR";
int m = sizeof x / sizeof *x;
int n = sizeof y / sizeof *y;
cout << lcs_length(x, y, m, n);
return EXIT_SUCCESS;
}
I am basically stuck on calling the backtrack function from lcs_length(), as I am not able to pass / traverse the 2-dimensional array within backtrack.
Kindly help.
Thanks.
Your two-dimensional array L in lcs_length is not an ordinary C++ array, because you use run-time variables to set its dimensions: it is a variable-length array, a compiler extension rather than standard C++, and it cannot be passed to a parameter declared as int L[][7]. If you give it compile-time sizes instead, e.g. int L[8][8], and declare the parameter as int L[][8] to match, everything compiles.
But to solve your problem I would rather use a real dynamic array, for example:
int **L = new int* [m+1];
for (int i = 0; i < m+1; i++)
    L[i] = new int[n+1];
Then you can pass the array as int ** L.
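If you would rather avoid the manual new/delete bookkeeping (the arrays above also need matching delete[] calls), a std::vector-based version is a clean alternative; here is a minimal sketch. Note also that sizeof x / sizeof *x in your main counts the terminating '\0', so m and n come out one larger than the string lengths; using strlen avoids that off-by-one.
// Sketch: the same algorithm with std::vector, so the table's run-time
// dimensions can be passed to backtrack without fixed-size array parameters.
#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>
using namespace std;

void backtrack(const char x[], const char y[],
               const vector<vector<int>>& L, int m, int n)
{
    if (m == 0 || n == 0)
        return;
    if (x[m-1] == y[n-1]) {
        backtrack(x, y, L, m-1, n-1);
        cout << x[m-1] << " ";
    }
    else if (L[m-1][n] > L[m][n-1])
        backtrack(x, y, L, m-1, n);
    else
        backtrack(x, y, L, m, n-1);
}

int lcs_length(const char x[], const char y[], int m, int n)
{
    vector<vector<int>> L(m+1, vector<int>(n+1, 0));
    for (int i = 1; i <= m; i++)
        for (int j = 1; j <= n; j++)
            L[i][j] = (x[i-1] == y[j-1]) ? L[i-1][j-1] + 1
                                         : max(L[i-1][j], L[i][j-1]);
    backtrack(x, y, L, m, n);  // pass m and n, not m+1 and n+1
    return L[m][n];
}

int main()
{
    const char x[] = "ABCDGH";
    const char y[] = "AEDFHR";
    // prints the subsequence "A D H " followed by its length, 3
    cout << lcs_length(x, y, (int)strlen(x), (int)strlen(y)) << endl;
    return 0;
}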