Bitonic sorting in CUDA misorders some values - C++

I'm writing a sorting routine in CUDA for a larger project and decided to implement bitonic sorting. The number of elements to sort will always be a power of two; in fact, it will be 512. I also need an array holding the final positions, because this method will be used to order an array that represents the quality matrix of another solution.
fitness is the array I'm sorting, numElements is the number of elements, and orden is initially an empty array of numElements positions, filled at the very beginning as orden[i] = i. orden is not actually relevant to this issue, but I kept it.
My problem is that some values aren't sorted properly, and so far I've been unable to figure out what the problem is.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <ctime>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_functions.h>
#include "float.h"
__global__ void sorting(int * orden, float * fitness, int numElements);
// Populating array with random values for testing purposes
__global__ void populate( curandState * state, float * fitness{
curandState localState = state[threadIdx.x];
int a = curand(&localState) % 500;
fitness[threadIdx.x] = a;
}
//Curand setup for the populate method
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
int id = threadIdx.x;
curand_init(seed, id, 0, &state[id]);
}
int main()
{
float * arrayx;
int numelements = 512;
int * orden;
float arrayCPU[512] = { 0 };
curandState * state;
cudaDeviceReset();
cudaSetDevice(0);
cudaMalloc(&state, numelements * sizeof(curandState));
cudaMalloc((void **)&arrayx, numelements*sizeof(float));
cudaMalloc((void **)&orden, numelements*sizeof(int));
setup_cuRand << <1, numelements >> >(state, unsigned(time(NULL)));
populate << <1, numelements >> > (state, arrayx);
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < numelements; i++)
printf("fitness[%i] = %f\n", i, arrayCPU[i]);
sorting << <1, numelements >> >(orden, arrayx, numelements);
printf("\n\n");
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < numelements; i++)
printf("fitness[%i] = %f\n", i, arrayCPU[i]);
cudaDeviceReset();
return 0;
}
__device__ bool isValid(float n){
return !(isnan(n) || isinf(n) || n != n || n <= FLT_MIN || n >= FLT_MAX);
}
__global__ void sorting(int * orden, float * fitness, int numElements){
int i = 0;
int j = 0;
float f = 0.0;
int aux = 0;
//initial orden registered (1, 2, 3...)
orden[threadIdx.x] = threadIdx.x;
//Logarithm on base 2 of numElements
for (i = 2; i <= numElements; i = i * 2){
// descending from i reducing to half each iteration
for (j = i; j >= 2; j = j / 2){
if (threadIdx.x % j < j / 2){
__syncthreads();
// ascending or descending consideration using (threadIdx.x % (i*2) < i)
if ((threadIdx.x % (i * 2) < i) && (fitness[threadIdx.x] > fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x])) ||
((threadIdx.x % (i * 2) >= i) && (fitness[threadIdx.x] <= fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x + j / 2])))){
aux = orden[threadIdx.x];
orden[threadIdx.x] = orden[threadIdx.x + j / 2];
orden[threadIdx.x + j / 2] = aux;
//Relocate the fitness values as well
f = fitness[threadIdx.x];
fitness[threadIdx.x] = fitness[threadIdx.x + j / 2];
fitness[threadIdx.x + j / 2] = f;
}
}
}
}
}
For example, here is the output I got on one random execution:
[screenshot: a random execution]
This is a representation of my bitonic sorting:
[diagram: bitonic sorting schema; the arrows point to where the worse of the two compared values goes]

Here are the issues I found:
In your posted code, this does not compile:
__global__ void populate( curandState * state, float * fitness{
^
missing close parenthesis
I added a close parenthesis there.
It's not necessary to take the address of the array in these cudaMemcpy statements:
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
....
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
the array name already decays to the address of the array, so I removed the ampersands. (It happens to work here only because arrayCPU is a statically declared array; with a dynamically allocated array, taking the address of the pointer variable would be broken.)
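To make the distinction concrete, here is a small sketch (mine, not the original poster's code) contrasting the two cases:
float stackArr[512];
cudaMemcpy(stackArr, arrayx, 512 * sizeof(float), cudaMemcpyDeviceToHost);    // correct: the array decays to a pointer
float *heapArr = (float *)malloc(512 * sizeof(float));
cudaMemcpy(heapArr, arrayx, 512 * sizeof(float), cudaMemcpyDeviceToHost);     // correct: heapArr already holds the destination address
// cudaMemcpy(&heapArr, arrayx, 512 * sizeof(float), cudaMemcpyDeviceToHost); // wrong: would overwrite the pointer variable (and memory beyond it)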
Your usage of __syncthreads() here is broken:
for (j = i; j >= 2; j = j / 2){
if (threadIdx.x % j < j / 2){
__syncthreads();
usage of __syncthreads() inside a conditional statement is generally incorrect unless the conditional statement evaluates uniformly across the threadblock. This is covered in the documentation. We can achieve the desired effect with a slight change:
for (j = i; j >= 2; j = j / 2){
__syncthreads();
if (threadIdx.x % j < j / 2){
With the above changes, your code appears to run correctly for me, for most cases. Your usage of FLT_MIN in your validity check is also questionable, if you intend 0 (or any negative values) to be sorted correctly. Speaking generally, FLT_MIN is a number that is very small, close to zero. If you were thinking that this is a large negative number, it is not. As a result, zero is a possible output of your random number generator, and it will not be sorted correctly. I'll leave this one to you to fix, it should be straightforward, but it will depend on what you ultimately want to achieve. (If you only want to sort positive non-zero floating point values, the test may be OK, but in this case your random number generator can return 0.)
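If the intent is simply to reject NaN and infinity while letting zero and negative values sort normally, one possible fix (a sketch of mine, not the original code; the exact criterion depends on what counts as a valid fitness value) is:
__device__ bool isValid(float n){
    // Reject only NaN and +/- infinity; all finite values, including 0 and negatives, are valid.
    return isfinite(n);
}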

Related

Matrix initialization not working with higher values

I have to initialize a matrix which has to be later passed to a Cuda kernel. But I get a segmentation fault when I initialize the matrix. The code is as follows -
#include <iostream>
int main(){
size_t m = 512;
size_t k = 32;
size_t n = 32;
float* a = (float*) malloc(m * k * sizeof(float));
if(a == nullptr){
std::cout<<"Nullptr returned, Check Memory Hardware"<<std::endl;
exit(-1);
}
for(size_t i=0; i<m; i++){
std::cout<<i<<std::endl;
for(size_t j=0; j<k; j++){
std::cout<<j<<" ";
a[i*m + j] = 1.0f;
}
std::cout<<std::endl<<"=-=-=-=-=-=-=-=- ||||| =-=-=-=-=-=-=-=-=-=-=-=-=-"<<std::endl;
}
}
The code only works when m < 100; for larger m, it throws a segmentation fault at i = 97.
I was able to use matrices of side 2^12 with the exact same code.
My system configuration - RAM 16GB 2667 MT/s, i7 9750h and RTX 2070.
a[i*m + j] = 1.0f;
This math is wrong. Using the terminology of i representing the row and j representing the column, there are k values per row, therefore this should be:
a[i*k + j] = 1.0f;
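For completeness, the corrected inner loops (stripped of the debug printing) would look roughly like this:
for (size_t i = 0; i < m; i++) {      // row
    for (size_t j = 0; j < k; j++) {  // column
        a[i*k + j] = 1.0f;            // k values per row, so the row stride is k
    }
}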

CUDA array filtering kernel without a for loop

I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if it is not zero, copy the row into another array B. Can I get the index into the entries of B without using a for loop? Please see the code below.
I would probably need to define b_ptr somehow to make it static (similar to what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, we can remove the loop entirely. We do first need to determine which rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into shared memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as written, filter needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where noted), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then copy B in place to be more dense (or download the dense part of each portion from the device using size_B); a host-side sketch of that stitching step follows the test below.
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
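To make the multi-block extension described above a bit more concrete, here is a hedged host-side sketch of the stitching step. It assumes each block wrote a densely packed segment into its own region of d_B and its row count into a per-block d_size_B array; the names pack_blocks, d_B_dense, num_blocks and rows_per_block are mine, not part of the code above.
#include <vector>
#include <cuda_runtime.h>
// Hypothetical multi-block stitching: block b has written counts[b] filtered rows,
// densely packed, into d_B starting at b * rows_per_block * W. Pack all segments
// back to back into d_B_dense and return the total number of filtered rows.
int pack_blocks(const float *d_B, float *d_B_dense, const int *d_size_B,
                int num_blocks, int rows_per_block, int W) {
    std::vector<int> counts(num_blocks);
    cudaMemcpy(counts.data(), d_size_B, num_blocks * sizeof(int), cudaMemcpyDeviceToHost);
    int total = 0;
    for (int b = 0; b < num_blocks; b++) {
        // Copy only this block's filled rows, appending after the rows from earlier blocks.
        cudaMemcpy(d_B_dense + (size_t)total * W,
                   d_B + (size_t)b * rows_per_block * W,
                   (size_t)counts[b] * W * sizeof(float),
                   cudaMemcpyDeviceToDevice);
        total += counts[b];
    }
    return total;
}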
This problem is complicated and can't be done with CUDA in one step: you can't search for the desired rows and put them into array B hoping they will end up in the correct order, because CUDA threads don't necessarily process the rows in order. However, there is a multi-step solution that does the trick. First, run a kernel that locates the zeros within the third column (whose index is 2, not 3, by the way) and marks those rows with a value of 1 in an array P. After that, a simple for loop counts these locations and stores them in another array Ind. Finally, a second kernel copies the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
Ind = new int[size_A]; // allocate Ind before filling it
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}

Equivalent of curand for OpenCL

I am looking at switching from NVIDIA to AMD for my compute card because I want double precision support. Before doing this, I decided to learn OpenCL on my NVIDIA card to see if I like it. I want to convert the following code from CUDA to OpenCL. I am using the curand library to generate uniformly and normally distributed random numbers. Each thread needs to be able to create a different sequence of random numbers and generate a few million of them per thread. Here is the code. How would I go about this in OpenCL? Everything I have read online seems to imply that I should generate a buffer of random numbers and then use that on the GPU, but that is not practical for me.
template<int NArgs, typename OptimizationFunctor>
__global__
void statistical_solver_kernel(float* args_lbounds,
float* args_ubounds,
int trials,
int initial_temp,
unsigned long long seed,
float* results,
OptimizationFunctor f)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= trials)
return;
curandState rand;
curand_init(seed, idx, 0, &rand);
float x[NArgs];
for(int i = 0; i < NArgs; i++)
{
x[i] = curand_uniform(&rand) * (args_ubounds[i]- args_lbounds[i]) + args_lbounds[i];
}
float y = f(x);
for(int t = initial_temp - 1; t > 0; t--)
{
float t_percent = (float)t / initial_temp;
float x_prime[NArgs];
for(int i = 0; i < NArgs; i++)
{
x_prime[i] = curand_normal(&rand) * (args_ubounds[i] - args_lbounds[i]) * t_percent + x[i];
x_prime[i] = fmaxf(args_lbounds[i], x_prime[i]);
x_prime[i] = fminf(args_ubounds[i], x_prime[i]);
}
float y_prime = f(x_prime);
if(y_prime < y || (y_prime - y) / y_prime < t_percent)
{
y = y_prime;
for(int i = 0; i < NArgs; i++)
{
x[i] = x_prime[i];
}
}
}
float* rptr = results + idx * (NArgs + 1);
rptr[0] = y;
for(int i = 1; i <= NArgs; i++)
rptr[i] = x[i - 1];
}
The VexCL library provides an implementation of counter-based generators. You can use those inside larger expressions, see this slide for an example.
EDIT: Take this with a grain of salt, as I am the author of VexCL :).
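The key idea behind counter-based generators is that each value is a pure function of a (seed, counter) pair, so no per-thread generator state has to be stored between calls; every work-item can simply hash its global id together with a call counter. Here is a rough, illustrative C++ sketch of the concept (this is not VexCL's API, just the idea, using a splitmix64-style mixing function):
#include <cstdint>
// Stateless "counter-based" random value: a pure function of (seed, counter).
// Each GPU thread would use counter = global_id * calls_per_thread + call_index,
// so no generator state needs to be carried between calls.
static inline uint64_t mix64(uint64_t x) {           // splitmix64-style mixer
    x += 0x9e3779b97f4a7c15ULL;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
    return x ^ (x >> 31);
}
static inline float counter_uniform(uint64_t seed, uint64_t counter) {
    // Map the top 24 bits of the mixed value to a float in [0, 1).
    return (mix64(seed ^ mix64(counter)) >> 40) * (1.0f / 16777216.0f);
}
Two such uniforms can then be turned into a normally distributed value with a Box-Muller transform, which covers the curand_normal usage as well.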

CUDA: How to fill a vector of dynamic size on device and return its contents to another device function?

I want to know the proper technique to fill a dynamic-size array on the device (int *row, in the code below) and then return its contents to be used by another device function.
To put the question in context, the code below attempts to expand an arbitrary function in a basis set of Legendre polynomials using Gauss-Legendre quadrature running on the GPU.
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
/*How can the array (or a pointer to the array) int *row, which is filled by this function on the device, be returned? */
__device__
void Pascal_Triangle(int n_row, int * row) {
int a[100][100];
int i, j;
//first row and first coloumn has the same value=1
for (i = 1; i <= n_row; i++) {
a[i][1] = a[1][i] = 1;
}
//Generate the full Triangle
for (i = 2; i <= n_row; i++) {
for (j = 2; j <= n_row - i; j++) {
if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
break;
}
a[i][j] = a[i - 1][j] + a[i][j - 1];
}
}
row = new int[n_row];
for (i = 1; i <= n_row; i++) {
row[i] = a[i][n_row-1];
}
}
__device__
double Legendre_poly(int order, double x)
{
int n,k;
double val=0;
int *binomials;
for(n=order; n>=0; n--)
{
Pascal_Triangle(n, binomials); /*Here are the problems*/
for(k=0; k<=n; k++)
val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
}
return val;
}
__device__ __host__
double f(double alpha,double x)
{
/*function expanded on a basis of Legendre polynomials. */
return exp(-alpha*x*x);
}
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
coefficients[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2)*c1;
}
}
int main(void)
{
int N = 1<<23;
int N_nodes = 5;
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Perform the expansion
Span<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kerlnel works OK*/
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
and here is the makefile to compile the code above:
objects = main.o
all: $(objects)
nvcc -arch=sm_20 $(objects) -o span
%.o: %.cpp
nvcc -x cu -arch=sm_20 -I. -dc $< -o $@
clean:
rm -f *.o span
Thanks in advance for any suggestions.
(sorry my previous answer was off-base)
You are passing a row pointer to this function:
void Pascal_Triangle(int n_row, int * row) {
You are then attempting to overwrite this pointer with a new value:
row = new int[n_row];
Once you return from this function, row in the calling environment will be unmodified. (This is an ordinary C/C++ issue, not specific to CUDA.)
This is perhaps a confusing issue, but the pointer value of row is passed by value to the function Pascal_Triangle. You cannot modify the pointer value in the function, and expect the modified value to show up in the calling environment. (You can modify the contents of the locations that the pointer points to, which would be the usual reason to pass row by pointer.)
There are a few ways to fix this issue. The simplest might be just to pass the pointer by reference:
void Pascal_Triangle(int n_row, int * &row) {
Your code seems to have other defects in it. I would suggest that you employ proper cuda error checking and also run your code with cuda-memcheck.
In particular, the in-kernel new operator behaves in a similar fashion to in-kernel malloc, and it has similar limitations.
You are running out of device heap space, so many of your new operations are failing, and returning a NULL pointer.
As a test for this, it's good debug practice to put a line like this after your new operation:
if (row == NULL) assert(0);
(you'll also need to include assert.h)
If you do that, you'll find that this assert is being hit.
I haven't calculated how much device heap space your code actually needs, but it appears to be using quite a bit. In C++, it's customary to delete an allocation made by new once you're done with it. You might want to investigate freeing the allocations done with new, or else (even better) re-use the allocation (i.e. allocate it once per thread), and avoid the reallocation altogether.
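If you do keep per-thread allocations, you will also need to enlarge the device heap before the first kernel launch that calls new (the default heap is only 8 MB); that is what the cudaDeviceSetLimit call at the top of the modified code below does:
// Raise the device-side malloc/new heap before launching any kernel that allocates;
// this particular size (1 GiB) matches the modified code below.
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1048576ULL * 1024);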
here's a modification to your code that demonstrates the above (one allocation per thread) and compiles and runs without error for me:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
__device__ double *d_droot, *d_dweight;
/*How can the array (or a pointer to the array) int *row, which is filled by this function on the device, be returned? */
__device__
void Pascal_Triangle(int n_row, int *row) {
int a[100][100];
int i, j;
//first row and first coloumn has the same value=1
for (i = 1; i <= n_row; i++) {
a[i][1] = a[1][i] = 1;
}
//Generate the full Triangle
for (i = 2; i <= n_row; i++) {
for (j = 2; j <= n_row - i; j++) {
if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
break;
}
a[i][j] = a[i - 1][j] + a[i][j - 1];
}
}
for (i = 1; i <= n_row; i++) {
row[i] = a[i][n_row-1];
}
}
__device__
double Legendre_poly(int order, double x, int *my_storage)
{
int n,k;
double val=0;
int *binomials = my_storage;
if (binomials == NULL) assert(0);
for(n=order; n>=0; n--)
{
Pascal_Triangle(n, binomials); /*Here are the problems*/
for(k=0; k<=n; k++)
val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
}
return val;
}
__device__ __host__
double f(double alpha,double x)
{
/*function expanded on a basis of Legendre polynomials. */
return exp(-alpha*x*x);
}
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
int *thr_storage = new int[MY_LIM];
if (thr_storage == NULL) assert(0);
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete [] thr_storage;
}
}
int main(void)
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576ULL*1024));
int N = 1<<23;
int N_nodes = 5;
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Perform the expansion
Span<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kerlnel works OK*/
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
This code has a couple advantages:
it can run with a much smaller reservation on the device heap
it's considerably quicker than the vast number of allocations that your code was trying to do.
EDIT:
instead of the assert you could do something like this:
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
int *thr_storage = new int[MY_LIM];
if (thr_storage == NULL) printf("allocation failure!\n");
else {
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete [] thr_storage;
}
}
}

3D Elementwise Matrix Multiplication in CUDA?

I have a 2D Matrix Multiplication program using the following kernel:
__global__ void multKernel(int *a, int *b, int *c, int N)
{
int column = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
int index = row * N + column;
if(column < N && row < N)
{
c[index] = a[index] * b[index];
}
}
Now, I'd like to create a 3D matrix multiplication kernel, but I'm having trouble finding examples of how to create one (also, I'm terrible at reading mathematical formulae; it's something I need to improve on).
I know the GPU example will involve using
threadIdx.z
and so on, but I'm a bit lost with how to do it.
Could anyone point me in the right direction to either some formulae or sample code? Or even provide a basic example? I have a CPU example which should work, I think.
void matrixMult3D(int *a, int *b, int *c, int *z, int N)
{
int index;
for(int column = 0; column < N; column++)
{
for(int row = 0; row < N; row++)
{
for (int z = 0; z < N; z++)
{
index = row * N + column + z;
c[index] = a[index] * b[index] * z[index];
}
}
}
}
Am I at least on the right track?
Because what you are actually doing is just an element-wise product (I hesitate to call it a Hadamard Product because that isn't defined for hyper matrices AFAIK), you don't need to do anything differently from the simplest 1D version of your kernel code. Something like this:
template<int ndim>
__global__ void multKernel(int *a, int *b, int *c, int *z, int N)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
int idxmax = 1;
#pragma unroll
for(int i=0; i < ndim; i++) {
idxmax *= N;
}
for(; idx < idxmax; idx+=stride) {
c[idx] = a[idx] * b[idx] * z[idx];
}
}
[disclaimer: code written in browser, never compiled or run. use at own risk]
would work for any dimension of array with dimensions N (ndim=1), N*N (ndim=2), N*N*N (ndim=3), etc.
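To tie that to a concrete launch, a hedged usage sketch for ndim = 3 (the device pointers d_a, d_b, d_c and d_z are assumed to be allocated and filled elsewhere) might look like:
// The grid-stride loop inside the kernel covers all N*N*N elements,
// so any positive block count works; 128 blocks of 256 threads is just a plausible choice.
const int N = 64;
multKernel<3><<<128, 256>>>(d_a, d_b, d_c, d_z, N);
cudaDeviceSynchronize();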