Cuda passing char** to kernel - c++

I am having a spot of bother with this basic CUDA code.
I have a char** which is a flat 2D array of passwords; my current implementation simply has CUDA iterate through this list and display the passwords. However, when I go to display them I simply get "(NULL)". I'm not quite sure why this is. Can someone explain what is happening?
Main:
char **pwdAry;
pwdAry = new char *[numberOfPwd];
//pwdAry given some values (flat 2d array layout)
const int pwdArySize = sizeof(pwdAry);
dim3 grid(gridSize, gridSize);
dim3 block(blockSize, blockSize);
searchKeywordKernel<<<grid, block>>>(pwdAry);
return EXIT_SUCCESS;
Cuda:
__global__ void searchKeywordKernel(char **passwordList)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int pitch = blockDim.x * gridDim.x;
    int idx = x + y * pitch;
    int tidy = idx / pitch;
    int tidx = idx - (pitch * tidy);
    int bidx = tidx / blockDim.x;
    int bidy = tidy / blockDim.y;
    int currentThread = threadIdx.x + blockDim.x * threadIdx.y;
    printf("hi, i am thread: %i, and my block x: %i, and y: %i\n", currentThread, bidx, bidy);
    printf("My password is: %s\n", passwordList[currentThread]);
}

Based on discussion in the comments, here is example code that roughly follows the code in the question, using 3 different methods:
1. Use a "flattened" array. This is the traditional advice for beginners asking how to handle a double-pointer array (char **, or any other type), or any data structure that contains embedded pointers. The basic idea is to create a single-pointer array of the same underlying type (e.g. char *) and copy all the data to it, end-to-end. Since the array elements here are of variable length, we also need to pass an array containing the starting index of each string.
2. Use a direct double-pointer method. I consider this code difficult to write. It may also have performance implications. The canonical example is here, a stepwise description of what is required algorithmically is here, and/or here is a 3D (i.e. triple-pointer) worked example with method description (yuck!). This is fundamentally doing a deep copy in CUDA, and I consider it somewhat more difficult than typical CUDA coding.
3. Use the managed memory subsystem, which is available on CUDA platforms that support it. Coding-wise, this is probably simpler than either of the above two approaches.
Here is a worked example of all 3 methods:
$ cat t1035.cu
#include <stdio.h>
#include <string.h>

#define nTPB 256

__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data+indices[idx]);
}

__global__ void kern_2D(char **data, unsigned num_strings){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data[idx]);
}

int main(){
  const int num_strings = 3;
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);
  // method 1: "flattening"
  char *fdata = (char *)malloc((ds[0]+ds[1]+ds[2])*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    strcpy(fdata+next, data[i]);
    ind[i] = next;
    next += ds[i];}
  //copy to device
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();
  //method 2: "2D" (pointer-to-pointer) array
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char));
    cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();
  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  cudaMallocManaged(&(m_data[0]), ds[0]*sizeof(char));
  cudaMallocManaged(&(m_data[1]), ds[1]*sizeof(char));
  cudaMallocManaged(&(m_data[2]), ds[2]*sizeof(char));
  // initialize said array
  strcpy(m_data[0], s0);
  strcpy(m_data[1], s1);
  strcpy(m_data[2], s2);
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  cudaDeviceSynchronize();
  return 0;
}
$ nvcc -arch=sm_35 -o t1035 t1035.cu
$ cuda-memcheck ./t1035
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 2:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 3:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
========= ERROR SUMMARY: 0 errors
$
Notes:
I suggest running this code with cuda-memcheck if you are just testing it out for the first time. I have omitted proper CUDA error checking for brevity of presentation, but I recommend it any time you are having trouble with a CUDA code; a minimal checking macro is sketched below. Proper execution of this code depends on having a managed memory subsystem available (read the doc links I have provided). If your platform does not support it, running this code as-is will probably result in a seg fault, because I have not included proper error checking.
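For illustration (not part of the original answer), a minimal checking macro might look like this; the macro name cudaCheck is made up:
#include <cstdio>
#include <cstdlib>
// minimal sketch of CUDA error checking; wrap every runtime call with it
#define cudaCheck(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } } while (0)
// usage: cudaCheck(cudaMalloc(&d_fdata, next*sizeof(char)));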
Copying a double-pointer array from device to host, although not explicitly covered in this example, is essentially the reverse of the steps in each of the 3 methods. For method 1, a single cudaMemcpy call can do it. For method 2, it requires a for-loop that reverses the steps of the copy to the device (including the use of the temp pointers); a sketch follows below. For method 3, nothing at all is required, other than proper adherence to managed memory coding practices, such as using cudaDeviceSynchronize() after a kernel call before attempting to access the data from host code again.
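For illustration, a sketch of the method 2 reverse (device-to-host) copy, reusing data, d_temp_data, ds and num_strings from the example above and assuming they are still in scope:
// method 2, reversed: copy each device string back into the host array.
// The device pointers are already known on the host via d_temp_data,
// so no extra cudaMemcpy of the pointer array itself is needed here.
for (int i = 0; i < num_strings; i++)
  cudaMemcpy(data[i], d_temp_data[i], ds[i]*sizeof(char), cudaMemcpyDeviceToHost);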
I don't wish to argue about whether or not methods 1 and 3 explicitly adhere to the letter of the question in terms of providing a method to pass a char ** array to a CUDA kernel. If your focus is that narrow, then please use method 2, or else disregard this answer entirely.
EDIT: Based on a question in the comments below, here is the above code modified with a different initialization sequence for the host-side strings. There are now compilation warnings (the flagged line is the pwdAry initialization), but those warnings arise from the code the OP specifically requested to be used:
$ cat t1036.cu
#include <stdio.h>
#include <string.h>

#define nTPB 256

__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data+indices[idx]);
}

__global__ void kern_2D(char **data, unsigned num_strings){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < num_strings)
    printf("Hello from thread %d, my string is %s\n", idx, data[idx]);
}

int main(){
  const int num_strings = 3;
#if 0
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);
#endif
  char ** pwdAry; pwdAry = new char *[num_strings]; for (int a = 0; a < num_strings; a++) { pwdAry[a] = new char[1024]; } for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; }
  // method 1: "flattening"
  char *fdata = (char *)malloc((1024*num_strings)*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    memcpy(fdata+next, pwdAry[i], 1024);
    ind[i] = next;
    next += 1024;}
  //copy to device
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();
  //method 2: "2D" (pointer-to-pointer) array
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMalloc(&(d_temp_data[i]), 1024*sizeof(char));
    cudaMemcpy(d_temp_data[i], pwdAry[i], 1024*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();
  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  cudaMallocManaged(&(m_data[0]), 1024*sizeof(char));
  cudaMallocManaged(&(m_data[1]), 1024*sizeof(char));
  cudaMallocManaged(&(m_data[2]), 1024*sizeof(char));
  // initialize said array
  for (int i = 0; i < num_strings; i++)
    memcpy(m_data[i], pwdAry[i], 1024);
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  cudaDeviceSynchronize();
  return 0;
}
$ nvcc -arch=sm_35 -o t1036 t1036.cu
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
$ cuda-memcheck ./t1036
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 2:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 3:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
========= ERROR SUMMARY: 0 errors
$

Related

Find the index of an element in an array with cuda c++

I am new to CUDA.
I have two arrays:
int* AA = new int[5]{ 1,2,3,4,5 };
int* BB = new int[5]{ 2,2,2,4,4 };
and I want to find, for each element of BB, the index of the matching element in AA, which in this case is
{1,1,1,3,3}
Here is my code:
__global__ void findIndex(int* A, int* B, int* C)
{
    int i = threadIdx.x;
    for (int j = 0; j < 5; j++)
    {
        if (B[i] == A[j])
        {
            C[i] = j;
        }
    }
}

int main() {
    int* AA = new int[5]{ 1,2,3,4,5 };
    int* BB = new int[5]{ 2,2,2,4,4 };
    int* CC = new int[5]{ 0,0,0,0,0 };
    int(*ppA), (*ppB), (*ppC);
    cudaMalloc((void**)&ppA, 5 * sizeof(int));
    cudaMalloc((void**)&ppB, 5 * sizeof(int));
    cudaMalloc((void**)&ppC, 5 * sizeof(int));
    cudaMemcpy(ppA, AA, 5 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(ppB, BB, 5 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(ppC, CC, 5 * sizeof(int), cudaMemcpyHostToDevice);
    int numBlocks = 1;
    dim3 threadsPerBlock(5);
    findIndex<<<numBlocks, threadsPerBlock>>>(ppA, ppB, ppC);
    cudaMemcpy(CC, ppC, 5 * sizeof(int), cudaMemcpyDeviceToHost);
    for (int m = 0; m < 5; m++) {
        printf("%d ", CC[m]);
    }
}
My output is:
{1,2,3,0,0}
Can anyone help?
The simplest non-stable single-GPU solution would be to use atomics, something like this:
__global__ void find(int * arr, int * counter, int * result)
{
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (arr[id] == 4)
    {
        int ctr = atomicAdd(counter, 1);
        result[ctr] = id;
    }
}
This way you collect the matching indices in the result array, and if the wanted number is sparse (only a few matches in the whole source array) it won't slow things down much. This is not an optimal way for multi-GPU systems, though: it requires host-side coordination between GPUs, unless a special CUDA feature from the newest toolkit is used (system-level atomics), as sketched below.
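A minimal sketch of that system-level variant (an illustration, assuming a device of compute capability 6.0 or later):
// inside the kernel: a system-wide atomic whose increment is visible
// coherently across GPUs and the CPU (e.g. on managed memory)
int ctr = atomicAdd_system(counter, 1);
result[ctr] = id;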
If the number of 4s makes the arr array "dense" with matches, or if you have multiple GPUs, then you should look at other solutions such as stream compaction: first mark the cells containing 4 as a mask, then do the compaction. Some Nvidia blogs and tutorials cover this algorithm; a Thrust-based sketch is given below.
For the "atomic" solution (especially with "shared"-memory atomics), the Maxwell architecture and onwards is much better than Kepler, just in case you still use a Kepler. Also, using atomics is not exactly reproducible, as the order of the atomic operations cannot be known; you will get a differently ordered result array most of the time. Stream compaction, by contrast, preserves the result order. This may save you from writing a sorting algorithm (like bitonic sort, shear sort, etc.) on top of it.

Allreduce with user defined function and MPI_BOTTOM

Consider the following program that is supposed to do some stupid addition of doubles:
#include <iostream>
#include <vector>
#include <mpi.h>

void add(void* invec, void* inoutvec, int* len, MPI_Datatype*)
{
    double* a = reinterpret_cast<double*>(inoutvec);
    double* b = reinterpret_cast<double*>(invec);
    for (int i = 0; i != *len; ++i)
    {
        a[i] += b[i];
    }
}

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    std::vector<double> buffer = { 2.0, 3.0 };
    MPI_Op operation;
    MPI_Op_create(add, 1, &operation);
    MPI_Datatype types[1];
    MPI_Aint addresses[1];
    int lengths[1];
    int count = 1;
    MPI_Get_address(buffer.data(), &addresses[0]);
    lengths[0] = buffer.size();
    types[0] = MPI_DOUBLE;
    MPI_Datatype type;
    MPI_Type_create_struct(count, lengths, addresses, types, &type);
    MPI_Type_commit(&type);
    MPI_Allreduce(MPI_IN_PLACE, MPI_BOTTOM, 1, type, operation, MPI_COMM_WORLD);
    MPI_Type_free(&type);
    MPI_Op_free(&operation);
    MPI_Finalize();
    std::cout << buffer[0] << " " << buffer[1] << "\n";
}
Because this is part of a larger program where the data I want to send is 1) on the heap and 2) consists of different types, I think I have to use a user-defined type.
Now something must be wrong here because the program crashes when run with mpirun -n 2 ./a.out. The backtrace from gdb is:
#0 __memcpy_sse2_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S:158
#1 0x00007ffff65de460 in non_overlap_copy_content_same_ddt () from /usr/local/lib/libopen-pal.so.6
#2 0x00007ffff180a69b in ompi_coll_tuned_allreduce_intra_recursivedoubling () from /usr/local/lib/openmpi/mca_coll_tuned.so
#3 0x00007ffff793bb8b in PMPI_Allreduce () from /usr/local/lib/libmpi.so.1
#4 0x00000000004088b6 in main (argc=1, argv=0x7fffffffd708) at mpi_test.cpp:39
Line 39 is the MPI_Allreduce call. This is probably a dumb mistake, but after staring on it for hours I still don't see it. Does anyone spot the mistake? Thanks!
Edit: There is a bug in how Open MPI handles types with non-zero lower bounds (such as the one that you create when using absolute addresses) while performing in-place reduce-to-all. It seems to exist in all versions, including the development branch. The status can be tracked by following the issue on GitHub.
Your add operator is wrong as you fail to account for the datatype's lower bound. A proper solution would be something like:
void add(void* invec, void* inoutvec, int* len, MPI_Datatype* datatype)
{
    MPI_Aint lb, extent;
    MPI_Type_get_true_extent(*datatype, &lb, &extent);
    double* a = reinterpret_cast<double*>(reinterpret_cast<char*>(inoutvec) + lb);
    double* b = reinterpret_cast<double*>(reinterpret_cast<char*>(invec) + lb);
    for (int i = 0; i != *len; ++i)
    {
        a[i] += b[i];
    }
}
This will access the data correctly but is still wrong: *len will be 1, as that is what you pass to MPI_Allreduce, yet there are two doubles behind each element. A correctly written operator will either use the type introspection mechanism to obtain the length of the block of doubles and multiply *len by it (a sketch of that variant follows the loop below), or simply hardcode the vector length to be two:
for (int i = 0; i < 2*(*len); i++)
{
    a[i] += b[i];
}
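A hedged sketch of the non-hardcoded variant; it assumes the user type is built purely from MPI_DOUBLE, so the count of doubles per element can be derived from the true extent:
void add(void* invec, void* inoutvec, int* len, MPI_Datatype* datatype)
{
    MPI_Aint lb, true_extent;
    MPI_Type_get_true_extent(*datatype, &lb, &true_extent);
    // assumption: the type contains nothing but doubles, so the number of
    // doubles per element is the true extent divided by sizeof(double)
    int nd = (int)(true_extent / sizeof(double));
    double* a = reinterpret_cast<double*>(reinterpret_cast<char*>(inoutvec) + lb);
    double* b = reinterpret_cast<double*>(reinterpret_cast<char*>(invec) + lb);
    for (int i = 0; i != nd * (*len); ++i)
    {
        a[i] += b[i];
    }
}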

CUDA: Group every n-th point of array passed to GPU

I am trying to implement the k-means algorithm on CUDA, using a Tesla card on an external Unix machine. I read an input file and store the coordinates of all data points in the dataX and dataY arrays. The next step is to select every centreInterval-th point and store it in another array allocated in GPU memory. However, I have no idea how I can even check what the problem is when all I get is "Segmentation error", and for obvious reasons I can't print any kind of output from the kernel.
EDIT 2: I simplified this example down to the shortest possible reproduction. I found the solution along the way, but decided to keep the not-yet-fixed version in the question to make it clearer what caused the problem.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define BLOCK_SIZE 16

// My kernel - Selects some centres at the beginning of algorithm and stores it at appropriate place
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = i * centreInterval;
    d_centresX[i] = d_dataX[idx];
    d_centresY[i] = d_dataY[idx];
}

// Simplified example
int main(int argn, char ** argc) {
    // My data - let's say it is 32 floats in each
    int dataSize = 32;
    float* dataX = new float[dataSize];
    float* dataY = new float[dataSize];
    // Fill arrays with numbers
    for (int i = 0; i < dataSize; i++) {
        dataX[i] = i;
        dataY[i] = i;
    }
    // Interval - we select first number, then 1 + N * centreInterval
    int centreInterval = 2;
    // There I will store my results in program
    int centreSize = dataSize / centreInterval;
    float* centresX = new float[centreSize];
    float* centresY = new float[centreSize];
    // Pointers to the arrays stored in GPU memory
    float* d_dataX;
    float* d_dataY;
    float* d_centresX;
    float* d_centresY;
    // Allocate memory for those arrays
    // Calculate how much space in memory do we need for this
    size_t d_centreSize = sizeof(float) * centreSize;
    size_t d_dataSize = sizeof(float) * dataSize;
    // Memory for raw data
    cudaMalloc((void**)&d_dataX, d_dataSize);
    cudaMalloc((void**)&d_dataY, d_dataSize);
    // Copy raw data to the device memory so we can operate on it freely
    cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);
    // Memory for centre results
    cudaMalloc((void**)&d_centresX, d_dataSize);
    cudaMalloc((void**)&d_centresY, d_dataSize);
    // Call kernel
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGridK((centreSize + dimBlock.x) / dimBlock.x);
    kMeansSelectInitialCentres<<<dimGridK, dimBlock>>>(d_dataX, d_dataY, d_centresX, d_centresY, centreInterval);
    // Check results - we get every n-th point
    float* check_x = new float[centreSize];
    float* check_y = new float[centreSize];
    cudaMemcpy(check_x, d_centresX, d_dataSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresY, d_dataSize, cudaMemcpyDeviceToHost);
    printf("X: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_x[i]);
    printf("\nY: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");
}
Main question: What is wrong with this kernel / the read-back of the data?
Side question: Is there any fair way to debug program kernels in such situations?
So, here's the solution I came up with after simplifying my case. There was a problem with memory usage: I tried to store / read a different amount of data than I had claimed to use when allocating it. I hope it will be helpful for anyone in the future (the fixed lines are marked with comments below):
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define BLOCK_SIZE 16

// My kernel - Selects some centres at the beginning of algorithm and stores it at appropriate place
// (fixed: a bounds check so surplus threads in a partially filled block do not access out of range)
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval, int centreSize) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= centreSize)
        return;
    int idx = i * centreInterval;
    d_centresX[i] = d_dataX[idx];
    d_centresY[i] = d_dataY[idx];
}

// Simplified example
int main(int argn, char ** argc) {
    // My data - let's say it is 32 floats in each
    int dataSize = 32;
    float* dataX = new float[dataSize];
    float* dataY = new float[dataSize];
    // Fill arrays with numbers
    for (int i = 0; i < dataSize; i++) {
        dataX[i] = i;
        dataY[i] = i;
    }
    // Interval - we select first number, then 1 + N * centreInterval
    int centreInterval = 2;
    // There I will store my results in program
    int centreSize = dataSize / centreInterval;
    float* centresX = new float[centreSize];
    float* centresY = new float[centreSize];
    // Pointers to the arrays stored in GPU memory
    float* d_dataX;
    float* d_dataY;
    float* d_centresX;
    float* d_centresY;
    // Allocate memory for those arrays
    // Calculate how much space in memory do we need for this
    size_t d_centreSize = sizeof(float) * centreSize;
    size_t d_dataSize = sizeof(float) * dataSize;
    // Memory for raw data
    cudaMalloc((void**)&d_dataX, d_dataSize);
    cudaMalloc((void**)&d_dataY, d_dataSize);
    // Copy raw data to the device memory so we can operate on it freely
    cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);
    // Memory for centre results (fixed: allocate d_centreSize bytes, not d_dataSize)
    cudaMalloc((void**)&d_centresX, d_centreSize);
    cudaMalloc((void**)&d_centresY, d_centreSize);
    // Call kernel (fixed: standard round-up grid size, and pass centreSize for the bounds check)
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGridK((centreSize + dimBlock.x - 1) / dimBlock.x);
    kMeansSelectInitialCentres<<<dimGridK, dimBlock>>>(d_dataX, d_dataY, d_centresX, d_centresY, centreInterval, centreSize);
    // Check results - we get every n-th point (fixed: copy d_centreSize bytes, matching the host buffers)
    float* check_x = new float[centreSize];
    float* check_y = new float[centreSize];
    cudaMemcpy(check_x, d_centresX, d_centreSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresY, d_centreSize, cudaMemcpyDeviceToHost);
    printf("X: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_x[i]);
    printf("\nY: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");
}

Matrix Multiplication with CUDA, long execution time

I'm new to CUDA, and I've been trying to figure out what I'm doing wrong here. CUDA is taking longer than just using the CPU to multiply a matrix. If I'm doing something wrong, please let me know.
Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <assert.h>
#include <time.h>
#define size 100 // Matrix size
#define cols size // Matrix width
#define rows size // Matrix height
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
__global__ void matrixMul( int *A, int *B, int *C)
{
int bx = blockIdx.x; // Block index
int tx = threadIdx.x; // Thread index
int ts = blockDim.x; // number of threads
// Declaration of the shared memory C element
extern __shared__ int c_element_sum[];
c_element_sum[tx] = A[tx+((bx/ts)*ts)] * B[(bx%ts)+(tx*ts)];
//Block until all threads in the block have written their data to shared mem
__syncthreads();
int sum;
for(int i=0; i<ts; i++){
if(i==0){
sum=c_element_sum[i];
}
else{
sum+=c_element_sum[i];
}
}
C[bx] = sum;
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
//create timer.
clock_t t1, t2;
//start timer
t1=clock();
//allocate host memory for matrices
unsigned int size_A = cols * rows;
unsigned int mem_size_A = sizeof(int) * size_A;
int* mA = (int*) malloc(mem_size_A);
unsigned int size_B = cols * rows;
unsigned int mem_size_B = sizeof(int) * size_B;
int* mB = (int*) malloc(mem_size_B);
unsigned int size_C = cols * rows;
unsigned int mem_size_C = sizeof(int) * size_C;
int* mC = (int*) malloc(mem_size_C);
//initialize host memory
for (int i = 0; i < size_A; ++i){
mA[i] = 1;
mB[i] = 1;
mC[i] = 0;
}
// allocate device memory
int* d_mA;
int* d_mB;
int* d_mC;
cudaMalloc((void**) &d_mA, mem_size_A);
cudaMalloc((void**) &d_mB, mem_size_B);
cudaMalloc((void**) &d_mC, mem_size_C);
//copy host memory to device (A and B)
cudaMemcpy(d_mA, mA, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_mB, mB, mem_size_B, cudaMemcpyHostToDevice);
cudaMemcpy(d_mC, mC, mem_size_C, cudaMemcpyHostToDevice);
// setup execution parameters
int numThreadsPerBlock = cols;
int numBlocks = (cols * rows);
int sharedMemSize = numThreadsPerBlock * sizeof(int);
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
// execute the kernel
matrixMul <<< dimGrid, dimBlock, sharedMemSize >>>(d_mA, d_mB, d_mC);
//Block until device has completed
cudaThreadSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
//copy result from device to host
cudaMemcpy(mC, d_mC, mem_size_C, cudaMemcpyDeviceToHost);
// Check for any CUDA errors
checkCUDAError("memcpy");
//stop timer
t2 = clock();
//check results
for (int i = 0; i < size_C; ++i){
assert(mC[i] == cols);
}
//clean up memory
free(mA);
free(mB);
free(mC);
cudaFree(d_mA);
cudaFree(d_mB);
cudaFree(d_mC);
printf("WITH CUDA - clocks: %d \n\n", t2-t1);
//////////////////////////////
///////// CPU ONLY //////////
/////////////////////////////
//create timer.
clock_t cpu_t1, cpu_t2;
//start timer
cpu_t1=clock();
//allocate host memory for matrices
unsigned int cpu_size_A = cols * rows;
unsigned int cpu_mem_size_A = sizeof(int) * cpu_size_A;
int* cpu_mA = (int*) malloc(cpu_mem_size_A);
unsigned int cpu_size_B = cols * rows;
unsigned int cpu_mem_size_B = sizeof(int) * cpu_size_B;
int* cpu_mB = (int*) malloc(cpu_mem_size_B);
unsigned int cpu_size_C = cols * rows;
unsigned int cpu_mem_size_C = sizeof(int) * cpu_size_C;
int* cpu_mC = (int*) malloc(cpu_mem_size_C);
//initialize host memory
for (int i = 0; i < cpu_size_A; ++i){
cpu_mA[i] = 1;
cpu_mB[i] = 1;
cpu_mC[i] = 0;
}
int ts = cols;
for(int bx=0; bx<(cols*rows);bx++){
int sum = 0;
for(int tx=0; tx<cols; tx++){
sum += cpu_mA[tx+((bx/ts)*ts)] * cpu_mB[(bx%ts)+(tx*ts)];
}
cpu_mC[bx]=sum;
}
//stop timer
cpu_t2 = clock();
//check results
for (int i = 0; i < cpu_size_C; ++i){
assert(cpu_mC[i] == cols);
}
//clean up memory
free(cpu_mA);
free(cpu_mB);
free(cpu_mC);
printf("CPU ONLY - clocks: %d \n\n", cpu_t2-cpu_t1);
return 0;
}
Based on your program, this is expected. Your timer looks like it clocks the entire execution of the program, which would include copying to the device, computation time, and copying the results back. Given the rather small workload you've provided for the program (100x100 matrices), the overhead of the memory copies far outweighs any computational benefit you get when doing the computation with the kernel. Your kernel itself is also not the most efficient implementation.
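To see this directly, you could time the kernel by itself with CUDA events; a minimal sketch (not part of the original answer), reusing the names from the question's code:
// sketch: timing only the kernel, excluding host<->device copies
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
matrixMul<<<dimGrid, dimBlock, sharedMemSize>>>(d_mA, d_mB, d_mC);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
printf("kernel time: %f ms\n", elapsed_ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);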
I don't think you're doing anything wrong, it's just that you haven't provided a large enough chunk of work for the GPU and you could potentially further optimize your kernel. Note that simply scaling up the size of the chunk may not significantly improve the performance with respect to the CPU, since you would also be scaling up the memory management time. While it is relatively simple to write a first implementation of a program on CUDA, it is significantly more difficult to get good performance out of it. The most effective way to use CUDA is to have a high ratio of compute to memory transactions. For example, having a pipeline of several compute-intensive kernels to operate successively on a chunk of data, only needing host-device copying at the beginning and end.
If this is just a program to help you learn to code for CUDA, this is a great step and getting a deep understanding of how to optimize matrix multiplication kernels will serve you well in many other cases. If you are writing this kernel for use in a production piece of software, I would recommend you use the highly-optimized linear algebra library CUBLAS: http://developer.nvidia.com/cublas (or some other library where the hard work has been done for you already).
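For reference, a hedged sketch of what the CUBLAS route might look like. Note that cuBLAS operates on floating-point matrices in column-major order, so this assumes float device buffers d_A, d_B, d_C rather than the int matrices used above:
#include <cublas_v2.h>

// C = A * B for size x size float matrices already resident on the device
cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 0.0f;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            size, size, size,   // m, n, k
            &alpha, d_A, size,  // A and its leading dimension
            d_B, size,          // B and its leading dimension
            &beta, d_C, size);  // C and its leading dimension
cublasDestroy(handle);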

Help Needed!! Compiling C++ program - errors

This is a routine that I believe is for C. I copied it (legally) out of a book and am trying to get it to compile and run in Visual Studio 2008. I would like to keep it as a C++ program. I have lots of programming experience in IBM mainframe assembler but none in C++. Your help is greatly appreciated. I think just a couple of simple changes are needed, but I have read tutorials and beat on this for hours, getting nowhere. I am getting a lot (4) of error C2440 ('=' : cannot convert from 'void *' to 'int *') errors on statements in the ReedSolomon function. Thanks so much! Program follows:
#include <iostream>
using namespace std;

int wd[50] = {131,153,175,231,5,184,89,239,149,29,181,153,175,191,153,175,191,159,231,3,127,44,12,164,59,209,104,254,150,45};
int nd = 30, nc = 20, i, j, k, *log, *alog, *c, gf = 256, pp = 301;

/* The following is a routine which calculates the error correction codewords
for a given data codeword string of length "nd", stored as an integer array wd[].
The function ReedSolomon() first generates log and antilog tables for the Galois
Field of size "gf" (in the case of ECC 200, 2^8) with prime modulus "pp"
(in the case of ECC 200, 301), then uses them in the function prod(), first to
calculate coefficients of the generator polynomial of order "nc" and then to
calculate "nc" additional check codewords which are appended to the data in wd[]. */

/* "prod(x,y,log,alog,gf)" returns the product "x" times "y" */
int prod(int x, int y, int *log, int *alog, int gf)
{
    if (!x || !y)
        return 0;
    else
        return alog[(log[x] + log[y]) % (gf-1)];
}

/* "ReedSolomon(wd,nd,nc,gf,pp)" takes "nd" data codeword values in wd[] */
/* and adds on "nc" check codewords, all within GF(gf) where "gf" is a */
/* power of 2 and "pp" is the value of its prime modulus polynomial */
void ReedSolomon(int *wd, int nd, int nc, int gf, int pp)
{
    int i, j, k, *log, *alog, *c;
    /* allocate, then generate the log & antilog arrays: */
    log = malloc(sizeof(int) * gf);
    alog = malloc(sizeof(int) * gf);
    log[0] = 1-gf; alog[0] = 1;
    for (i = 1; i < gf; i++)
    {
        alog[i] = alog[i-1] * 2;
        if (alog[i] >= gf) alog[i] ^= pp;
        log[alog[i]] = i;
    }
    /* allocate, then generate the generator polynomial coefficients: */
    c = malloc(sizeof(int) * (nc+1));
    for (i=1; i<=nc; i++) c[i] = 0; c[0] = 1;
    for (i=1; i<=nc; i++)
    {
        c[i] = c[i-1];
        for (j=i-1; j>=1; j--)
        {
            c[j] = c[j-1] ^ prod(c[j],alog[i],log,alog,gf);
        }
        c[0] = prod(c[0],alog[i],log,alog,gf);
    }
    /* clear, then generate "nc" checkwords in the array wd[] : */
    for (i=nd; i<=(nd+nc); i++) wd[i] = 0;
    for (i=0; i<nd; i++)
    {
        k = wd[nd] ^ wd[i];
        for (j=0; j<nc; j++)
        {
            wd[nd+j] = wd[nd+j+1] ^ prod(k,c[nc-j-1],log, alog,gf);
        }
    }
    free(c);
    free(alog);
    free(log);
    return ();
}

int main ()
{
    reedsolomon (50,30,20,256,301);
    for (i = 1; i < 51; i++)
    {
        cout<< i; "="; wd[i];
    }
    cout<<"HEY, you, I'm alive! Oh, and Hello World!\n";
    cin.get();
    return 1;
}
In C++, a void pointer can't be implicitly converted to a different pointer type.
So instead of
int *pInt;
pInt = malloc(sizeof(int) * 5);
You need to say
int *pInt;
pInt = (int *) malloc(sizeof(int) * 5);
or preferably
int *pInt = new int[5];
(with a matching delete[] instead of free), or, better still, use a std::vector if it's intended to be dynamic; a sketch follows below.
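For instance, a sketch of the allocations in ReedSolomon rewritten with std::vector (an assumption about how you might restructure it, not the book's code):
#include <vector>

void ReedSolomon(int *wd, int nd, int nc, int gf, int pp)
{
    // vectors size and free themselves; no malloc/free or casts needed
    std::vector<int> log(gf), alog(gf), c(nc + 1);
    // ... the same table and checkword generation as before,
    // using log[i], alog[i], c[i] exactly as with the raw pointers ...
}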
At the beginning of the program add #include <cstdlib>; malloc is declared there, so without it malloc will not compile. In C++, void* to int* is not an automatic conversion, so in lines 31, 32 and 40 you need to cast to int*, e.g.: log = (int *)malloc(sizeof(int) * gf);
In the main function, at line 63, you're calling the function as reedsolomon; it should be ReedSolomon, the way you declared it (C++ is case-sensitive).
Also, ReedSolomon is declared as void ReedSolomon(int *wd, int nd, int nc, int gf, int pp), but in main you call ReedSolomon(50,30,20,256,301), so you are assigning an int value to a pointer to int; that's a type clash. I'm not sure what it is you want to do with wd, but presumably you meant to pass the array itself, e.g. ReedSolomon(wd,30,20,256,301).
Next time, please post the errors from the compiler so people don't have to compile the code themselves to check and see what's wrong.
Also, a good technique which will save you a lot of time is to do a Google search on the error the compiler gives you (it is very likely somebody already had that same mistake), and to read a C++ book to get acquainted with the language.
Cheers!
C++ requires that you cast the return value of malloc to whatever type of pointer you're assigning it to. So e.g. log = malloc(sizeof(int) * gf); needs to become log = (int *) malloc(sizeof(int) * gf);.
You should cast when assigning the return of malloc to a pointer.
Example:
log = reinterpret_cast<int*>(malloc(sizeof(int) * gf));