How to make my Cuda kernel run on bigger matrixes? - c++

So my code is suppsed to work like this:
-take in_martix of NxN elements and R factor
-it should give back a matrix of size [N-2R]*[N-2R] with each element being a sum of in_matrix elements in R radius it should work like this for N=4 R=1
Even though my code works for smaller matrixes, for bigger ones like 1024 or 2048 or even bigger R factors it gives back a matrix of 0's. Is it a problem inside my code or just my GPU can't compute more calculations ?
Code: (for testing purposes initial matrix is filled with 1's so every element of out_matrix should == (2R+1)^2
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;
__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int out_local = 0;
for (int col = tx; col <= tx + 2 * d_R ; col++)
for (int row = ty; row <= ty + 2 * d_R ; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
void random_ints(int tab[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
tab[i][j] = 1;
}
int main() {
static int tab[N][N];
random_ints(tab);
int tab_size = sizeof(int) * N * N;
int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);
dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE )), ceil((float)N / (float)(THREAD_BLOCK_SIZE )));
void** d_tab;
void** d_out;
cudaMalloc((void**)&d_tab, tab_size);
cudaMalloc((void**)&d_out, out_size);
cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);
int* d_tab_begin = (int*)(d_tab);
int* d_out_begin = (int*)(d_out);
MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);
int* out = (int*)malloc(out_size);
cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int col = 0; col < N - 2 * R; col++)
{
for (int row = 0; row < N - 2 * R; row++)
{
cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
}
cout << endl;
}
}

Finally thanks to Robert I found how to make the code work - by adding if statment
if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
for (int col = tx; col <= tx + 2 * d_R; col++)
for (int row = ty; row <= ty + 2 * d_R; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}

Related

CUDA array filtering kernel without a for loop

I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if that is not zero, copy the row into another array B. Can I have the index to the entries of B without using a for loop, please see the below code?
I probably would need to define b_ptr somehow to make it static (similar to the what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as is, filtered needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download from device the dense parts from each portion using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
This problem is complicated and can't be done with CUDA in one step, you can't search for the desired rows and put them in array B hoping that they will be in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you will run a kernel that will locate the zeros within the third column, whose index is 2 not 3 by the way, then mark these rows with value of 1 in an array P. After that, a simple for loop will count these locations and store them in another array Ind. Finally, a second kernel will copy the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}

Problem in converting the "for loop" in CUDA

I have tried to extract patches from an image parallelly with pixel shift/overlapping. I have written the CPU version of the code. But I could not able to convert the for loop which has an increment of pixel shift. I have given the part of the code where for loop is being used. CreatePatchDataSet function has the "for loop " which has an increment of pixel shift. Please help me out to convert this function into Cuda. I have provided the following code.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>
using namespace std;
using namespace cv;
#define PATCH_SIZE (5)
#define PIXEL_SHIFT (2)
void ConvertMat2DoubleArray(cv::Mat input, double* output)
{
for (int i = 0; i < input.rows; i++)
{
double *src = input.ptr<double>(i);
for (int j = 0; j < input.cols; j++)
{
output[input.cols * input.channels() * i + input.channels() * j + 0] = src[j];
}
}
}
void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {
*num_of_patch_col = 0;
int len_nb = 0;
while (len_nb < width) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_col)++;
}
len_nb = 0;
*num_of_patch_row = 0;
while (len_nb < height) {
if (len_nb != 0) {
len_nb += patch_size - (patch_size - pixel_shift);
}
else {
len_nb += patch_size;
}
(*num_of_patch_row)++;
}
*num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}
void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int counter_row = 0;
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
for (int i = 0; i < height; i += pixel_shift) {
int counter_col = 0;
for (int j = 0; j < width; j += pixel_shift) {
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0,
cv::INTER_LANCZOS4);
double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
free(orgimageH);
free(FY);
return 0;
}
The results I got for first 10 values in CPU version:
patch numbers:
16129
238,240,240,235,237,230,227,229,228,227
I have tried to convert this function to Kernel function using cuda:. But it goes into the infinite loop. As I am very new to this CUDA field, could you please help me to find out the problem in the code ?
__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
int num_of_patch_image = num_of_patch_row * num_of_patch_col;
int i = threadIdx.x + (blockDim.x*blockIdx.x);
int j = threadIdx.y + (blockDim.y*blockIdx.y);
while (i<height && j< width)
{
int counter_row = 0;
int counter_col = 0;
//Get Low Resolution Image
for (int ii = 0; ii < patch_size; ii++) {
for (int jj = 0; jj < patch_size; jj++) {
if ((i + ii) < height && (j + jj) < width) {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
}
else {
patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
}
}
}
counter_col++;
if (counter_col == num_of_patch_col) {
break;
}
counter_row++;
if (counter_row == num_of_patch_row) {
break;
}
}
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
}
int main()
{
int ratio=2;
cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
ConvertMat2DoubleArray(imageH, orgimageH);
int widthH = imageH.cols;
int heightH = imageH.rows;
//
int dimH = (int)PATCH_SIZE * (int)PATCH_SIZE* (int)image.channels();
int dimL = (int)PATCH_SIZE/ratio* (int)PATCH_SIZE/ratio * (int)image.channels();
//3. Create training data set=========================
int num_of_patch_image = 0;
int num_of_patch_col = 0;
int num_of_patch_row = 0;
GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
cout<<"patch numbers: \n " << num_of_patch_image << endl;
double* FY = (double*)calloc(dimH * num_of_patch_image, sizeof(double));
double *d_orgimageH;
gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));
double *d_FY;
gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH * num_of_patch_image));
gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH , sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT, (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH * num_of_patch_image, cudaMemcpyDeviceToHost));
// cout<<"Hello world";
free(orgimageH);
free(FY);
cudaFree(d_FY);
cudaFree(d_orgimageH);
return 0;
}
Image I have used: [1]: https://i.stack.imgur.com/Ywg7p.png
i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;
is outside the while loop in your kernel. As i and j never change inside the while loop, it isn't stopping. There could be more problems here, but this is the most prominent one.
EDIT: Another one that I found, is that you have only one while over both i and j instead of one for each. You should probably use for loops like in your CPU code:
for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x));
i < height;
i += pixel_shift * blockDim.x * gridDim.x) {
for (j = ...; j < ...; j += ...) {
/* ... */
}
}
EDIT 2:
I could imagine this to be a good idea:
for (counter_row = threadIdx.y + blockDim.y * blockIdx.y;
counter_row < num_of_patch_row;
counter_row += blockDim.y * gridDim.y) {
i = counter_row * pixel_shift;
if (i > height)
break;
for (counter_col = threadIdx.x + blockDim.x * blockIdx.x;
counter_col < num_of_patch_col;
counter_col += blockDim.x * gridDim.x) {
j = counter_col * pixel_shift;
if (j > width)
break;
/* ... */
}
}
I have also exchanged the x/y fields of the execution parameters between the inner and the outer loop, as it seemed more appropriate considering that the x field is continuous in warps (memory access benefits).

if statement runtime error

I originally had 3 equations: Pu, Pm & Pd. It ran fine.
Once I introduced the if statement, with variations on the 3 equations, depending on the loop iteration, I receive a runtime error.
Any help would be appreciated.
Cheers in advance.
#include <cmath>
#include <iostream>
#include <vector>
#include <iomanip>
int Rounding(double x)
{
int Integer = (int)x;
double Decimal = x - Integer;
if (Decimal > 0.49)
{
return (Integer + 1);
}
else
{
return Integer;
}
}
int main()
{
double a = 0.1;
double sigma = 0.01;
int delta_t = 1;
double M = -a * delta_t;
double V = sigma * sigma * delta_t;
double delta_r = sqrt(3 * V);
int count;
double PuValue;
double PmValue;
double PdValue;
int j_max;
int j_min;
j_max = Rounding(-0.184 / M);
j_min = -j_max;
std::vector<std::vector<double>> Pu((20), std::vector<double>(20));
std::vector<std::vector<double>> Pm((20), std::vector<double>(20));
std::vector<std::vector<double>> Pd((20), std::vector<double>(20));
std::cout << std::setprecision(10);
for (int i = 0; i <= 2; i++)
{
count = 0;
for (int j = i; j >= -i; j--)
{
count = count + 1;
if (j = j_max) // Exhibit 1C
{
PuValue = 7.0/6.0 + (j * j * M * M + 3 * j * M)/2.0;
PmValue = -1.0/3.0 - j * j * M * M - 2 * j * M;
PdValue = 1.0/6.0 + (j * j * M * M + j * M)/2.0;
}
else if (j = j_min) // Exhibit 1B
{
PuValue = 1.0/6.0 + (j * j * M * M - j * M)/2.0;
PmValue = -1.0/3.0 - j * j * M * M + 2 * j * M;
PdValue = 7.0/6.0 + (j * j * M * M - 3 * j * M)/2.0;
}
else
{
PuValue = 1.0/6.0 + (j * j * M * M + j * M)/2.0;
PmValue = 2.0/3.0 - j * j * M * M;
PdValue = 1.0/6.0 + (j * j * M * M - j * M)/2.0;
}
Pu[count][i] = PuValue;
Pm[count][i] = PmValue;
Pd[count][i] = PdValue;
std::cout << Pu[count][i] << ", ";
}
std::cout << std::endl;
}
return 0;
}
You are assigning instead of checking for equal: j_max to j in your if statements.
if (j = j_max)
// ^
else if (j = j_min)
// ^
Change if (j = j_max) to if (j == j_max),
And else if (j = j_min) to else if (j == j_min).
Correct the following if conditional check and all other instances of an if check
if(j=j_max)
with
if (j == j_max)
you are checking for an equality not assigning.
Your code was going into an infinite loop.

"while"/"for" loop in kernel causing CUDA out of memory error?

If I change the while loop (see kernel below, it's a monstrous loop, you can't miss it) to iterate only once, it uses a negligible amount of GPU memory. However, when the loop is allowed to iterate 50,000 times as shown below, the GPU instantly takes on 2.5 GB. The problem persists even when using a "for" loop. Can someone please offer an explanation and perhaps a solution to prevent the kernel from using so much memory? This behavior is highly unusual, IMO. Thanks in advance!
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
__global__ void myKern(const float *transMatrix, float *masterForces, const double *rands, const int r_max)
{
const int iterationsx = 50000;
const int RUsizex = 26;
int threadsPerBlock = blockDim.x * blockDim.y;
int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
int globalIdx = (blockId * threadsPerBlock) + threadId;
int RU[RUsizex] = {0};
int index = 0;
float r = 0.0;
double temp = 0;
float forces[iterationsx] = {0.0};
int left[RUsizex - 2] = {0};
int right[RUsizex - 2] = {0};
curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);
int i= 0;
while( i < iterationsx)
{
for(int k = 0; k < RUsizex - 2; k++)
{
left[k] = RU[k];
right[k] = RU[k+2];
}
for(int j = 0; j < RUsizex -2; j++)
{
r = curand_uniform(&s);
index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ;
RU[j + 1]= (RU[j + 1]) + ( r < transMatrix[index]) * (transMatrix[index + 1]) +
(! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
(! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;
}
for(int z = 1; z < RUsizex - 1; z++)
{
temp = temp + (RU[z] ==4) + (RU[z] ==5);
}
forces[i] = temp/(24.0);
temp = 0.0;
i++;
}
for(int y = 0; y < iterationsx; y++)
{
masterForces[globalIdx + (r_max * y)] = forces[y];
}
}
The variable float forces[iterationsx] is a stack variable in a global function. This requires a stack reservation of > 200,000B per thread. The CUDA driver must allocate a local memory allocation based upon the maximum resident threads using the formula SmCount * MaxTheadsPerSm * (LocalMemoryPerThread + StackPerThread). For a full GK110 this would be 15 * 2048 * ~51KiB = 1.5 GiB.

Multiply two-dimensional matrices.‏ with pycuda

how can I iterate in two arrays?
__global__ void euclidean(float *x, float *y, int dim_x, int dim_y, int ms, float *solution) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
int idy = threadIdx.y + blockDim.y * blockIdx.y;
float result = 0.0;
for (int iter = 0; iter < ms; iter++) {
float x_e = x[idy * ms + iter];
float y_e = y[idx * ms + iter];
result += (x_e * y_e);
}
}
Input: X = [[1,2], [3,4], [5,6], [7,8], [9,10]] and Y = [[0,0], [1,1]]
Expected Output: [[0, 3], [0, 7], [0, 11], [0, 15]. [0, 19]]
How can I do this? My difficulty is to iterate on X and Y.
Expected:
[idx: 0 idy: 0 = 0] [idx: 1 idy: 0 = 3] [idx: 2 idy: 0 = 0] [idx: 3
idy: 0 = 7] [idx: 4 idy: 0 = 0] [idx: 0 idy: 1 = 11] [idx: 1 idy: 1 =
0] [idx: 2 idy: 1 = 15] [idx: 3 idy: 1 = 0] [idx: 4 idy: 1 = 19]
I would do the following to multiply 2 matrices. This handles boundary conditions so should work on any grid/block size.
// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
float cValue = 0;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if ((Row < numCRows) && (Col < numCColumns)) {
for (int k = 0; k < numAColumns; k++) {
cValue += A[Row*numAColumns + k] * B[k*numBColumns + Col];
}
C[Row*numCColumns + Col] = cValue;
}
}
If you want a more efficient implementation you can also use the shared memory:
// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
__shared__ float ds_A[TILE_WIDTH_I][TILE_WIDTH_I];
__shared__ float ds_B[TILE_WIDTH_I][TILE_WIDTH_I];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int Row = by * TILE_WIDTH + ty;
int Col = bx * TILE_WIDTH + tx;
float cValue = 0;
for (int m = 0; m < (numAColumns/TILE_WIDTH); m++) {
if (Row < numARows && m*TILE_WIDTH_I + tx < numAColumns) {
ds_A[ty][tx] = A[Row*numAColumns + m*TILE_WIDTH_I + tx];
} else {
ds_A[ty][tx] = 0;
}
if (m*TILE_WIDTH_I + ty < numBRows && Col < numBColumns) {
ds_B[ty][tx] = B[(m*TILE_WIDTH_I + ty)*numBColumns + Col];
} else {
ds_B[ty][tx] = 0;
}
__syncthreads();
if ((Row < numCRows) && (Col < numCColumns)) {
for (int k = 0; k < TILE_WIDTH; k++) {
cValue += ds_A[ty][k] * ds_B[k][tx];
}
}
__syncthreads();
}
if ((Row < numCRows) && (Col < numCColumns)) {
C[Row*numCColumns + Col] = cValue;
}
}