How do you allocate GPU memory in a separate CUDA function? - c++

I'm new to CUDA and I'm sure I'm doing something that's simple enough to fix, but I'm also not sure exactly what to search for to find an answer. I've tried looking around, but to no avail.
I have a few functions in my code that I want to perform matrix operations with, so instead of writing the code to allocate the memory multiple times, I want to use a function to do that for me. My issue is that the memory location is not being passed back to the function calling my MatrixInitCUDA function.
If I directly allocate the memory in my matrix functions it works as expected, but the issue I'm running into is that my pointer to device memory is only being assigned to the pointer inside of the MatrixInitCUDA function.
Initially I thought there might have been some kind of type conversion of the arguments, so I included the typeinfo header and printed out the type of the device argument before and after cudaMalloc (no change, which isn't surprising). I've tried passing in double pointers for the device matrix arguments, but that doesn't seem to work either, although I'm not sure I did it properly.
// Compile using nvcc <file> -lcublas -o <output>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <typeinfo>
// Define block size for thread allocation
#define BLOCK_DIM 32
#define N 10
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
    unsigned int A_height, A_width, B_height, B_width, C_height, C_width;
} MatrixSize;

void SetMatrixSize(MatrixSize *matrixSize,
                   unsigned int widthA, unsigned int heightA,
                   unsigned int widthB, unsigned int heightB,
                   unsigned int widthC, unsigned int heightC)
{
    matrixSize->A_height = heightA;
    matrixSize->A_width = widthA;
    matrixSize->B_height = heightB;
    matrixSize->B_width = widthB;
    matrixSize->C_height = heightC;
    matrixSize->C_width = widthC;
}
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float *dev_matrixA, float *dev_matrixB, float *dev_matrixC)
{
    // Assign CUDA variables
    devID = 0;
    cudaGetDevice(&devID);
    cudaError_t err;
    // Assign size variables
    size_t matrixA_size = matrixSize->A_height * matrixSize->A_width * sizeof(float);
    printf("Allocation size: %d\tMatrix Size: %d\n", (int) matrixA_size, matrixSize->A_height * matrixSize->A_width);
    size_t matrixB_size = matrixSize->B_height * matrixSize->B_width * sizeof(float);
    size_t matrixC_size = matrixSize->C_height * matrixSize->C_width * sizeof(float);
    printf("PRE ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
    // Allocate memory on GPU
    err = cudaMalloc((void **) &dev_matrixA, matrixA_size);
    printf("POST ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
    printf("DEV A POST ALLOC: %p\n", dev_matrixA);
    if (err != cudaSuccess) printf("Allocate matrix A: %s\n", cudaGetErrorString(err));
    err = cudaMalloc((void **) &dev_matrixB, matrixB_size);
    if (err != cudaSuccess) printf("Allocate matrix B: %s\n", cudaGetErrorString(err));
    err = cudaMalloc((void **) &dev_matrixC, matrixC_size);
    if (err != cudaSuccess) printf("Allocate matrix C: %s\n", cudaGetErrorString(err));
    // Copy data from host PC to GPU
    err = cudaMemcpy(dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
    err = cudaMemcpy(dev_matrixB, host_matrixB, matrixB_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix B to GPU: %s\n", cudaGetErrorString(err));
    err = cudaMemcpy(dev_matrixC, host_matrixC, matrixC_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix C to GPU: %s\n", cudaGetErrorString(err));
}
int main(int argc, char **argv)
{
    // Create memory for Layer 1, Layer 2, Layer 3 vectors
    // float *layer1 = malloc(784*sizeof(floats)))
    // Create memory for Weight 1->2, Weight 2->3 matrices
    // Layer 1 will read from file for input (X) values
    // Layer 2 and 3 will be calculated
    int devID = 0;
    cudaGetDevice(&devID);
    // Testing hadamard product, init function, and set matrix size function
    float *host_A, *host_B, *host_C, *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;
    MatrixSize *mallocTest = (MatrixSize *) calloc(sizeof(MatrixSize), 1);
    size_t calcSize = N * N * sizeof(float);
    host_A = (float *) calloc(calcSize, 1);
    host_B = (float *) calloc(calcSize, 1);
    host_C = (float *) calloc(calcSize, 1);
    SetMatrixSize(mallocTest, N, N, N, N, N, N);
    printf("DEV A PRE ALLOC: %p\n", dev_A);
    // Initialize memory on GPU
    MatrixInitCUDA(argc, argv, devID, mallocTest,
                   host_A, host_B, host_C,
                   dev_A, dev_B, dev_C);
    printf("DEV A POST INIT: %p\n", dev_A);
    return 0;
}
Here's the output I get if I compile and run this code:
DEV A PRE ALLOC: (nil)
Allocation size: 400 Matrix Size: 100
PRE ALLOC TYPE: Pf
POST ALLOC TYPE: Pf
DEV A POST ALLOC: 0x10208400000
DEV A POST INIT: (nil)

There are multiple ways to achieve the desired behavior.
Method 1
One way is to modify the MatrixInitCUDA arguments to accept double pointers (**) for the device pointers and adjust the code as follows:
Modify the function signature:
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float **dev_matrixA, float **dev_matrixB, float **dev_matrixC)
{
}
Allocate device memory as follows inside MatrixInitCUDA:
err = cudaMalloc((void **) dev_matrixA, matrixA_size);
Call MatrixInitCUDA from main like this:
MatrixInitCUDA(argc, argv, devID, mallocTest,
               host_A, host_B, host_C,
               &dev_A, &dev_B, &dev_C);
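One related detail: with the double-pointer signature, the cudaMemcpy calls inside MatrixInitCUDA must dereference the device arguments, since dev_matrixA is now a float **. A minimal sketch of one adjusted copy (error handling as in the original):
err = cudaMemcpy(*dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
and likewise for matrices B and C.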
Method 2
My personal favorite is to do none of the above and simply modify the function signature to accept references to the device pointers, as follows:
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float *&dev_matrixA, float *&dev_matrixB, float *&dev_matrixC)
{
}
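With the reference signature, the body of MatrixInitCUDA and the call in main stay exactly as they were; only the declaration changes, because dev_matrixA now aliases the caller's pointer. As a quick sanity check (a sketch, assuming either method is applied), the final print in main should now report the address assigned by cudaMalloc rather than (nil):
printf("DEV A POST INIT: %p\n", dev_A); // expected: the same address as DEV A POST ALLOC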

Related

What causes this segmentation fault (core dumped) error at cudaMemcpy when copying to GPU?

I have been trying to fix segmentation fault (core dumped) error messages with a toy program when calling cudaMemcpy. It works for small images, but for bigger images it normally fails; I say normally because it has sometimes succeeded when debugging with valgrind (more about that below). I have looked at similar questions but have been unable to find the answer; sorry if this is a duplicate! I am just learning (and following Programming Massively Parallel Processors).
Here is my code, cleaned up:
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include "opencv2/imgproc/imgproc.hpp"
#include <cuda.h>
#include <iostream>
#include <cuda_runtime.h>
using namespace cv;
using namespace std;
__global__ void
colorToGreyKernel(unsigned char* outPic, unsigned char* inPic, unsigned int width, unsigned int height){
    // printf("trying \n" );
    int Col = blockDim.x * blockIdx.x + threadIdx.x;
    int Row = blockDim.y * blockIdx.y + threadIdx.y;
    if( Col < width && Row < height){
        int greyOffset = Row * width + Col;
        int rgbOffset = greyOffset * 3;
        unsigned char b = inPic[rgbOffset];
        unsigned char g = inPic[rgbOffset + 1];
        unsigned char r = inPic[rgbOffset + 2];
        outPic[greyOffset] = 0.21f*r + 0.71f*g + 0.07f*b;
    }
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    bool test = code == cudaSuccess;
    // cout << "code " << std::boolalpha << test;
    if (code != cudaSuccess)
    {
        // const char *errorStr = NULL;
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main(int argc, char** argv )
{
    if ( argc != 2 )
    {
        printf("usage: DisplayImage.out <Image_Path>\n");
        return -1;
    }
    Mat image;
    unsigned int imSize[2] = {400,400};
    unsigned char* inPic = NULL;
    unsigned char* outPic = NULL;
    gpuErrchk(cudaMalloc(&inPic, imSize[0] * imSize[1] * 3 * sizeof(CV_8U)));
    gpuErrchk(cudaMalloc(&outPic, imSize[0] * imSize[1] * sizeof(CV_8U)));
    image = imread( argv[1], IMREAD_COLOR );
    resize(image, image, Size(imSize[0], imSize[1]));
    Mat greyImg(image.rows, image.cols, CV_8U, Scalar(125));
    size_t size = image.cols * image.rows * image.channels() * sizeof(CV_8U);
    // This is where it always fails for bigger images
    gpuErrchk(cudaMemcpy(inPic, (void*) &image.data[0], size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(outPic, (void*) &greyImg.data[0], size/3, cudaMemcpyHostToDevice));
    dim3 dimGrid(ceil(image.rows/16.0), ceil(image.cols/16.0), 1);
    dim3 dimBlock(16,16,1);
    colorToGreyKernel<<<dimGrid, dimBlock>>>(outPic, inPic, (int) image.rows, (int) image.cols);
    cudaDeviceSynchronize();
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpy(greyImg.data, outPic, size / 3, cudaMemcpyDeviceToHost));
    namedWindow("Display Image", WINDOW_AUTOSIZE );
    imshow("Display Image", greyImg);
    waitKey(0);
    cudaFree(&inPic[0]);
    cudaFree(&outPic[0]);
    return 0;
}
I'm able to allocate on the device, but the copying fails for bigger images. I've tried it using opencv::cuda, and I can load any picture and do cvtColor on the device without resizing, so I conclude it's not memory (and looking at nvidia-smi suggests the same).
When I run using valgrind, I get a lot of Invalid write of size 8 errors around this point, all referencing libcuda. I know it's this particular memcpy that's the problem, from isolating it. Sometimes it also works in valgrind, but I've gathered that this is normal. I don't have experience with valgrind yet, but the memory issues don't make sense to me (I'm trying to copy to the device, so why a segmentation fault, which is related to the host?).
My question is simple: where does the error come from, and how do I fix it?
NVCC = 11.1
GPU = GeForce GTX 960M (not a lot, but that shouldn't matter)
Again, I am new to programming in CUDA, but I have tried what I can think of and cannot isolate the problem! Thanks for your help.
The problem here relates to your usage of OpenCV. An item like CV_8U is not a type; it is a compiler #define. Therefore sizeof(CV_8U) is not doing what you think it is. Your intended usage would be to capture the size of the underlying type (e.g. unsigned char, i.e. a type size of 1). However, sizeof(CV_8U) evidently returns the size of an int, which is 4.
As a result of that, your calculation of size is wrong (4x too large). As a result of that, when the cudaMemcpy operation attempts to access &image.data[0] for size bytes, it will attempt to copy past the end of the buffer. For small images, the overrun doesn't trigger the run time check/limit. For a large enough size calculation (large enough image) you will hit a seg fault. Although the failure is triggered within a CUDA call, the origin of the error is outside of CUDA.
One possible solution is to replace your usage of sizeof(CV_8U) with something like sizeof(unsigned char). Since that size is 1, you can also just delete the multiplication by sizeof(CV_8U) and get the same behavior.
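For concreteness, a sketch of the corrected allocations and size calculation, using the variable names from the question:
// CV_8U denotes an unsigned char element, so the element size is 1 byte
gpuErrchk(cudaMalloc(&inPic, imSize[0] * imSize[1] * 3 * sizeof(unsigned char)));
gpuErrchk(cudaMalloc(&outPic, imSize[0] * imSize[1] * sizeof(unsigned char)));
size_t size = image.cols * image.rows * image.channels() * sizeof(unsigned char);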
You can also avoid this sort of manual allocation and let OpenCV do the allocation (and host-device data copying) work for you, as demonstrated in the answers here and here.
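As a rough sketch of that approach (assuming an OpenCV build with the CUDA modules enabled):
#include <opencv2/cudaimgproc.hpp> // provides cv::cuda::cvtColor

cv::cuda::GpuMat d_color, d_grey;
d_color.upload(image);                                   // OpenCV sizes and performs the host-to-device copy
cv::cuda::cvtColor(d_color, d_grey, cv::COLOR_BGR2GRAY); // grayscale conversion on the device
d_grey.download(greyImg);                                // copy the result back into a host Mat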

C++ class dll with CUDA member?

I have a C++ class-based dll. I'd like to convert some of the class members to CUDA-based operations.
I am using VS2012, Windows 7, CUDA 6.5, sm_20.
Say the original SuperProjector.h file is like:
class __declspec(dllexport) SuperProjector
{
public:
    SuperProjector(){};
    ~SuperProjector(){};
    void sumVectors(float* c, float* a, float* b, int N);
};
and the original sumVectors() function in SuperProjector.cpp:
void SuperProjector::sumVectors(float* c, float* a, float* b, int N)
{
    for (int n = 0; n < N; n++)
        c[n] = a[n] + b[n];
}
I am stuck on how to convert sumVectors() to CUDA. Specifically:
I read some posts saying that adding __global__/__device__ keywords in front of class members will work, but do I then need to change the suffix of the .cpp file to .cu?
I also tried to create a CUDA project from the beginning, but it seems VS2012 does not give me the option of creating a dll once I choose to create a CUDA project.
I am very confused about the best way to convert some of the members of this C++ class-based dll into CUDA kernel functions. I'd appreciate anyone who can offer some ideas, or better, some very simple examples.
Create a CUDA project, let's call it cudaSuperProjector, and add two files: cudaSuperProjector.cu and cudaSuperProjector.h.
cudaSuperProjector.h
class __declspec(dllexport) cudaSuperProjector {
public:
    cudaSuperProjector(){ }
    ~cudaSuperProjector(){ }
    void sumVectors(float* c, float* a, float* b, int N);
};
cudaSuperProjector.cu
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cudaSuperProjector.h"

__global__ void addKernel(float *c, const float *a, const float *b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int size) {
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(float));
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice);
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost);
    // Release the device buffers before returning.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return cudaStatus;
}

void cudaSuperProjector::sumVectors(float* c, float* a, float* b, int N) {
    cudaError_t cudaStatus = addWithCuda(c, a, b, N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSuperProjector::sumVectors failed!");
    }
}
Note: in the properties of the file cudaSuperProjector.cu, the Item Type should be CUDA C/C++.
Go to the properties of the project and, under General, set the Configuration Type to Dynamic Library (.dll). Everything needed to build the library is now ready. Compile this project, and in the output folder you will find cudaSuperProjector.dll and cudaSuperProjector.lib. Create a directory cudaSuperProjector\lib and copy cudaSuperProjector.dll and cudaSuperProjector.lib there. Also create cudaSuperProjector\include and copy cudaSuperProjector.h into it.
Create another Visual C++ project, let's call it SuperProjector, and add the file SuperProjector.cpp to it.
SuperProjector.cpp
#include <stdio.h>
#include "cudaSuperProjector/cudaSuperProjector.h"
int main(int argc, char** argv) {
    float a[6] = { 0, 1, 2, 3, 4, 5 };
    float b[6] = { 1, 2, 3, 4, 5, 6 };
    float c[6] = { };
    cudaSuperProjector csp;
    csp.sumVectors(c, a, b, 6);
    printf("c = {%f, %f, %f, %f, %f, %f}\n",
           c[0], c[1], c[2], c[3], c[4], c[5]);
    return 0;
}
In the properties of the project, add the path to the dll and lib files under VC++ Directories -> Library Directories (for example D:\cudaSuperProjector\lib;), and under VC++ Directories -> Include Directories add the path to the header (for example D:\cudaSuperProjector\include;). Then go to Linker -> Input and add cudaSuperProjector.lib;.
Now your project should compile fine, but when you run it, it will show the error
The program can't start because cudaSuperProjector.dll is missing from
your computer. Try reinstalling the program to fix this problem.
You need to copy cudaSuperProjector.dll to the output folder of the project so that it sits in the same folder as SuperProjector.exe. You can do this manually, or add
copy D:\cudaSuperProjector\lib\cudaSuperProjector.dll $(SolutionDir)$(Configuration)\
in Build Events -> Post-Build Events -> Command Line,
where $(SolutionDir)$(Configuration)\ is the output path for the solution (see Configuration Properties -> General -> Output Directory).

Cuda "invalid argument" 2d array - Cellular automata

I'm trying to calculate a 2D cellular automaton redistribution using CUDA. I'm completely new to it, so I have no idea what I'm doing wrong. I've tried many solutions that I've seen here, but they all give "invalid argument" when I call the kernel.
Here is a simplified version of the kernel:
//kernel definition
__global__ void stepCalc(float B[51][51], int L, int flag, float m, float en)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    float g = B[i][j] - 0.25*(B[i+1][j] + B[i-1][j] + B[i][j+1] + B[i][j-1]);
    flag = 0;
    if (i < L-2 && j < L-2 && i > 2 && j > 2 && abs(g) > m)
    {
        flag = 1;
        en += -16*g*g + 8*B[i][j]*abs(g);
        B[i][j]   += -4*f*g;
        B[i+1][j] += f*g;
        B[i-1][j] += f*g;
        B[i][j+1] += f*g;
        B[i][j-1] += f*g;
    }
}
The main function looks like this:
#define L 50
float B[L+1][L+1];
//initialize B[i][j]
float g = 0;
int flag = 1;
float m = 0.1;
float en = 0;
while (flag == 1)
{
    float (*dB)[L+1];
    int *dFlag = NULL;
    float *dEn = NULL;
    cudaMalloc((void **)&dFlag, sizeof(int));
    cudaMalloc((void **)&dEn, sizeof(float));
    cudaMalloc((void **)&dB, ((L+1)*(L+1))*sizeof(float));
    cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
    cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(dEn, &en, sizeof(float), cudaMemcpyDeviceToHost);
    dim3 threadsPerBlock(16,16);
    dim3 numBlocks((L+1)/threadsPerBlock.x, (L+1)/threadsPerBlock.y);
    stepCalc<<<numBlocks, threadsPerBlock>>>(dB, L, dflag, m, dEn);
    GPUerrchk(cudaPeekAtLastError()); //gives "invalid argument" at this line
    cudaMemcpy(B, (dB), sizeB, cudaMemcpyDeviceToHost);
    cudaMemcpy(&flag, dFlag, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&en, dEn, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(dB);
    cudaFree(dFlag);
    cudaFree(dEn);
}
I need to extract the new array B, the flag value, and the sum en over all threads. Am I even close to how a solution should look? Is it even possible? I've also tried making the host array B a float** with no luck.
There are various problems with your code.
You may be overlooking the difference between passing a value to a kernel and passing a pointer:
__global__ void stepCalc(float B[51][51], int L, int flag, float m, float en)
                               ^                    ^
                               |                    |
                           a pointer             a value
We'll come back to B in a moment, but for values like flag and en, passing these by value to a kernel has similar implications to passing by value to a C function: it is a one-way communication path. Since it's evident from your code that you want to use the values modified by the kernel later in host code, you will need to pass pointers instead. In a few cases you have already allocated pointers for this purpose, so you have an additional type of error: in some cases (dFlag) you are passing a pointer where the kernel definition expects a value.
Regarding B, passing a 2D array from host to device can be more difficult than you might initially expect, due to the deep copy problem. Without covering all that ground here, search on "CUDA 2D array" in the upper right hand corner of this page, and you'll get a lot of information about it and various ways to deal with it. Since you seem to be willing to consider an array of fixed width (known at compile-time), we can simplify the handling of a 2D array by leveraging the compiler to help us with a particular typedef.
When you're having trouble with CUDA code, it's good practice to do rigorous CUDA error checking throughout your code, not in just one place. One reason for this is that a CUDA error incurred at a particular place will often be returned at any subsequent place in the code. This makes things confusing if you don't check every CUDA API call, as a particular "invalid argument" error might not be due to the kernel itself but to some API call that occurred previously.
You typically don't want cudaMalloc operations inside a data-processing while loop. These are normally operations you do once, at the beginning of your code. Doing the cudaMalloc at each iteration of the while loop has several negative consequences: you may eventually run out of memory (although you have cudaFree statements, so perhaps not), you are effectively throwing away your data at each iteration, and it will negatively impact your performance.
You have some of your cudaMemcpy transfer directions wrong, like here:
cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyDeviceToHost);
Setting flag to zero in your kernel code will be problematic. Warps can execute in any order, and after some warps have already set flag to 1 later in the kernel, other warps could begin executing and set flag to zero again. This is probably not what you want. One possible fix is to set flag to zero before executing the kernel (i.e. in host code, and copy it to the device).
Your kernel will generate out-of-bounds indexing here:
float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
(just ask yourself what happens when i=0 and j=0). The fix for this is to move this line of code inside the if-check you have for bounds checking right after it.
Your kernel uses a variable f which is defined nowhere that I can see, for example here:
B[i+1][j]+=f*g;
The following code is my attempt to rework your code, create a complete example, and remove the above issues. It doesn't do anything useful, but it compiles without errors and runs without errors for me. I haven't provided any data, so it's just a proof-of-concept at this point. I'm sure it still contains data processing errors.
#include <stdio.h>
#define my_L 50
typedef float farray[my_L+1];
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
//kernel definition
__global__ void stepCalc(farray B[], int L, int *flag, float m, float *en)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    //float g=B[i][j]-0.25*(B[i+1][j]+B[i-1][j]+B[i][j+1]+B[i][j-1]);
    // flag = 0;
    float f = 1.0f;
    if (i < L-2 && j < L-2 && i > 2 && j > 2){
        float g = B[i][j] - 0.25*(B[i+1][j] + B[i-1][j] + B[i][j+1] + B[i][j-1]);
        if (abs(g) > m)
        {
            *flag = 1;
            *en += -16*g*g + 8*B[i][j]*abs(g);
            B[i][j]   += -4*f*g;
            B[i+1][j] += f*g;
            B[i-1][j] += f*g;
            B[i][j+1] += f*g;
            B[i][j-1] += f*g;
        }
    }
}
int main(){
    farray B[my_L+1];
    //initialize B[i][j]
    farray *dB;
    int flag = 1;
    float m = 0.1;
    float en = 0;
    int *dFlag = NULL;
    float *dEn = NULL;
    cudaMalloc((void **)&dFlag, sizeof(int));
    cudaCheckErrors("1");
    cudaMalloc((void **)&dEn, sizeof(float));
    cudaCheckErrors("2");
    size_t sizeB = (my_L+1)*sizeof(farray);
    cudaMalloc((void **)&dB, sizeB);
    cudaCheckErrors("3");
    cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
    cudaCheckErrors("4");
    cudaMemcpy(dEn, &en, sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("5");
    dim3 threadsPerBlock(16,16);
    dim3 numBlocks((my_L+1)/threadsPerBlock.x, (my_L+1)/threadsPerBlock.y);
    while (flag == 1)
    {
        flag = 0;
        cudaMemcpy(dFlag, &flag, sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("6");
        stepCalc<<<numBlocks, threadsPerBlock>>>(dB, my_L, dFlag, m, dEn);
        cudaDeviceSynchronize();
        cudaCheckErrors("7");
        cudaMemcpy(&flag, dFlag, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("8");
    }
    cudaMemcpy(B, (dB), sizeB, cudaMemcpyDeviceToHost);
    cudaCheckErrors("9");
    cudaMemcpy(&en, dEn, sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("10");
    // process B
    cudaFree(dB);
    cudaFree(dFlag);
    cudaFree(dEn);
}

CUDA "Unknown error" for unknown reasons [duplicate]

This question already has an answer here:
Copy an object to device?
(1 answer)
Closed 9 years ago.
In my current project, a call to cudaGetLastError() is returning unknown error and I don't know why. The code compiles just fine, but it is not behaving how I would like it to.
Below is a brief, non-compilable example of what the relevant code consists of:
CU_Main.cu
Below is the CUDA kernel:
//My CUDA kernel
__global__ void CU_KernelTest(Kernel* matrix){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if(idx == 0 && idy == 0){
        printf("ID is: %d\n", idx);
        matrix->set(1, 1, 16.0f);
    }
}
Here is the host code:
//A host function which is called when a button is clicked
int HOST_OnbuttonClick(){
    Kernel* matrix = new Kernel(3, 3, 2);
    Kernel* device_matrix;
    cudaMalloc(&device_matrix, sizeof(Kernel));
    cudaMemcpy(device_matrix, matrix, sizeof(Kernel), cudaMemcpyHostToDevice);
    CU_KernelTest<<<256, 256>>>(device_matrix);
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(err));
    }
    cudaFree(device_matrix);
    return 0;
}
When matrix->set(1, 1, 16.0f); is included in the CUDA kernel, (err != cudaSuccess) returns true and prints out UNKNOWN ERROR, whereas if I comment set out, I get no error.
The other struct relevant to this is my own helper for a convolution kernel design I'm going for, naturally called Kernel.
Kernel.cuh
struct Kernel {
private :
    float* kernel;
    int rows;
    int columns;
public :
    __device__ __host__
    Kernel(int _rows, int _columns, float _default) {
        rows = _rows;
        columns = _columns;
        kernel = new float[rows * columns];
        for(int r = 0; r < rows; r++){
            for(int c = 0; c < columns; c++){
                kernel[r * rows + c] = _default;
            }
        }
    }
    __device__ __host__
    void set(int row, int col, float value){
        kernel[row * rows + col] = value;
    }
};
The goal of this design is to be able to set all values for the kernel on the host, send it to the CUDA kernel, set values there and then retrieve the updated object back at the host.
So there are really two issues: why would I get an unknown error message, and is the code syntactically correct, such that it should work?
Let me know if more information is needed.
Here are the results of the memory checker:
Nsight Debug
================================================================================
CUDA Memory Checker detected 1 threads caused an access violation:
Launch Parameters
CUcontext = 071c7340
CUstream = 08f3e3b8
CUmodule = 08fa97a8
CUfunction = 08fdbbe8
FunctionName = _Z13CU_KernelTestP6Kernel
gridDim = {1,1,1}
blockDim = {256,1,1}
sharedSize = 128
Parameters:
matrix = 0x06b60000 {kernel = 0x07a31718 ???, rows = 3, columns = 3}
Parameters (raw):
0x06b60000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
-----------------------------------------------------------------------------------------------
07a31728 4 adr st g 0 0 {0,0,0} {0,0,0} 000260 c:\users
Summary of access violations:
c:\users....kernel.cuh(26): error MemoryChecker: #misaligned=0 #invalidAddress=2
Your Kernel class contains a pointer. When you copy the class to the device, you have a host pointer on the device. Dereferencing it on the device gives you this invalid-address access violation.
This seems to be a regular source of confusion; Robert Crovella explained it just yesterday.
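For reference, a minimal sketch of the usual fix in host code (hypothetical, and assuming kernel is made accessible to the host for the copy): allocate the embedded array with cudaMalloc, copy its contents, and patch the struct's pointer to the device address before copying the struct itself.
float *dev_buf = NULL;
size_t buf_bytes = 3 * 3 * sizeof(float);
cudaMalloc(&dev_buf, buf_bytes);
// deep copy: send the array contents to a device buffer
cudaMemcpy(dev_buf, matrix->kernel, buf_bytes, cudaMemcpyHostToDevice);
// temporarily swap in the device pointer, then copy the struct
float *host_buf = matrix->kernel;
matrix->kernel = dev_buf;                  // the struct now carries a device address
cudaMemcpy(device_matrix, matrix, sizeof(Kernel), cudaMemcpyHostToDevice);
matrix->kernel = host_buf;                 // restore so the host object stays usable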

CUDA - cudaMallocPitch and cudaMemcpy2D use, Error: InvalidValue, InvalidPitchValue

Okay, so I'm trying to get a 2D array for CUDA to work on, but it's becoming a pain. The errors are in the title and occur at the cudaMemcpy2D calls. I think the problem will be obvious to trained eyes. Thank you in advance for any help; I've stepped ahead of my class, which is currently learning pointers.
#include <cuda_runtime.h>
#include <iostream>
#pragma comment (lib, "cudart")
/* Program purpose: pass a 10 x 10 matrix and multiply it by another 10x10 matrix */
float matrix1_host[100][100];
float matrix2_host[100][100];
float* matrix1_device;
float* matrix2_device;
size_t pitch;
cudaError_t err;
__global__ void addMatrix(float* matrix1_device, float* matrix2_device, size_t pitch){
    // How this works:
    // first we cycle through the rows by using the thread's ID,
    // then we calculate the address of a row by adding the pitch (size of each row in bytes)
    // multiplied by the number of rows already completed to the base address; then we can use
    // that row start address to access the columns in the row with a normal array index.
    int r = threadIdx.x;
    float* rowofMat1 = (float*)((char*)matrix1_device + r * pitch);
    float* rowofMat2 = (float*)((char*)matrix2_device + r * pitch);
    for (int c = 0; c < 100; ++c) {
        rowofMat1[c] += rowofMat2[c];
    }
}
void initCuda(){
    err = cudaMallocPitch((void**)matrix1_device, &pitch, 100 * sizeof(float), 100);
    err = cudaMallocPitch((void**)matrix2_device, &pitch, 100 * sizeof(float), 100);
    //err = cudaMemcpy(matrix1_device, matrix1_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
    //err = cudaMemcpy(matrix2_device, matrix2_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
    err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
    err = cudaMemcpy2D(matrix2_device, 100*sizeof(float), matrix2_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
}
void populateArrays(){
    for(int x = 0; x < 100; x++){
        for(int y = 0; y < 100; y++){
            matrix1_host[x][y] = (float) x + y;
            matrix2_host[y][x] = (float) x + y;
        }
    }
}

void runCuda(){
    dim3 dimBlock ( 100 );
    dim3 dimGrid ( 1 );
    addMatrix<<<dimGrid, dimBlock>>>(matrix1_device, matrix2_device, 100*sizeof(float));
    //err = cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
    err = cudaMemcpy2D(matrix1_host, 100*sizeof(float), matrix1_device, pitch, 100*sizeof(float), 100, cudaMemcpyDeviceToHost);
    //cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
}

void cleanCuda(){
    err = cudaFree(matrix1_device);
    err = cudaFree(matrix2_device);
    err = cudaDeviceReset();
}

int main(){
    populateArrays();
    initCuda();
    runCuda();
    cleanCuda();
    std::cout << cudaGetErrorString(cudaGetLastError());
    system("pause");
    return 0;
}
First of all, in general you should have a separate pitch variable for matrix1 and matrix2. In this case they will be the same value returned from the API call to cudaMallocPitch, but in the general case they may not be.
In your cudaMemcpy2D line, the second parameter to the call is the destination pitch. This is just the pitch value that was returned when you did the cudaMallocPitch call for this particular destination matrix (i.e. the first parameter).
The fourth parameter is the source pitch. Since this was allocated with an ordinary host allocation, it has no pitch other than its width in bytes.
So you have your second and fourth parameters swapped.
So instead of this:
err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
try this:
err = cudaMemcpy2D(matrix1_device, pitch, matrix1_host, 100*sizeof(float), 100*sizeof(float), 100, cudaMemcpyHostToDevice);
and similarly for the second call to cudaMemcpy2D. The third call is actually OK since it's going in the opposite direction; the source and destination matrices are swapped, so they line up with your pitch parameters correctly.
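Putting both points together, a sketch of initCuda reworked along those lines, with a separate pitch per matrix (note that cudaMallocPitch also needs the address of each device pointer, which the original calls did not pass):
size_t pitch1, pitch2; // one pitch per allocation
err = cudaMallocPitch((void**)&matrix1_device, &pitch1, 100 * sizeof(float), 100);
err = cudaMallocPitch((void**)&matrix2_device, &pitch2, 100 * sizeof(float), 100);
// destination pitch comes from cudaMallocPitch; source pitch is the host row width in bytes
err = cudaMemcpy2D(matrix1_device, pitch1, matrix1_host, 100*sizeof(float),
                   100*sizeof(float), 100, cudaMemcpyHostToDevice);
err = cudaMemcpy2D(matrix2_device, pitch2, matrix2_host, 100*sizeof(float),
                   100*sizeof(float), 100, cudaMemcpyHostToDevice);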