I intend to perform vector manipulations and was trying a small dummy program with vector addition and multiplication. However, the code does not run due to limitations on my knowledge on shared memory. All the sources in the internet show 2D matrix operations which I cannot translate to my vector problems. Please try to explain where am I going wrong considering the fact I am a novice in OpenCL. The code is given below:
Host Code:
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Context context;
cl::CommandQueue queue;
cl::Program program;
cl::Kernel kernel;
cl::Platform::get(&platforms);
deviceUsed = 0;
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(),0 };
context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = cl::CommandQueue(context, devices[deviceUsed]);
cl::Program::Sources source( 1, std::make_pair(kernel_source.c_str(), kernel_source.size()));
program = cl::Program(context, source);
program.build(devices);
std::vector < float > a;
std::vector < float > b;
std::vector < float > sum;
std::vector < float > prod;
int globalSize = 128;
int localSize = 16;
a.resize(globalSize);
b.resize(globalSize);
sum.resize(globalSize);
prod.resize(globalSize);
for (int i = 0; i < globalSize ; i++)
{
a[i] = 1.0f * i;
b[i] = 5.0f * i;
}
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_sum;
cl::Buffer buffer_prod;
buffer_A = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
buffer_B = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * globalSize , &a[0]);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * globalSize , &b[0]);
buffer_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
buffer_prod = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_sum);
kernel.setArg(3, buffer_prod);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize/localSize), cl::NDRange(N), NULL);
queue.finish();
queue.enqueueReadBuffer(buffer_sum, CL_TRUE, 0, sizeof(float) * globalSize, &sum[0]);
queue.enqueueReadBuffer(buffer_prod, CL_TRUE, 0, sizeof(float) * globalSize, &prod[0]);
Kernel:
#define STRINGI(ker) #ker
std::string kernel_source = STRINGI(
__kernel void KernelAddMul(__global float* a, __global float* b, __global float* sum, __global float* prod)
{
unsigned int j = get_local_id(0);
int N = get_local_size(0);
unsigned int i = N * get_global_id(0) + j;
float locSum[N];
float locProd[N];
__local float Asub[N];
__local float Bsub[N];
for(int k = 0; k < N; k++){
Asub[k] = a[i];
Bsub[k] = b[i];
barrier(CLK_LOCAL_MEM_FENCE);
locSum[k] = Asub[k] + Bsub[k];
locProd[k] = Asub[k] * Bsub[k];
barrier(CLK_LOCAL_MEM_FENCE);
sum[i] = locSum[k];
prod[i] = locProd[k];
}
}
);
I suspect that your code does not run because your kernel does not compile.
The following lines are invalid:
int N = get_local_size(0);
float locSum[N];
float locProd[N];
__local float Asub[N];
__local float Bsub[N];
N must be a constant, you cannot dynamically size the arrays using get_local_size(0).
I strongly recommend that you use a standalone compiler to compile your kernels:
CodeXL is very good, as is the Intel
SDK for OpenCL.
Anything is better than trying to debug your kernel in an application!
Related
I want to parallelize temperatures distribution, using OpenCL technology. I stocked on problem with my GPU - work item id for every other kernel function are the same. Instead of result, for example, from 0 to 1024, I got this result. What I did incorrectcly?
enter image description here
Source.cpp
include <iostream>
#include <string>
#include <fstream>
#include <omp.h>
#include <CL/cl.hpp>
float*** distributeOpenCL(float*** cuboid, int k, int m, int n)
{
// OpenCL init
int size = k * m * n;
float*** hResult = initCuboid(k, m, n);
cl_platform_id platform;
cl_device_id device;
cl_int error = 0;
std::ifstream file("program.cl");
std::string fileText = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
const char* srcText = fileText.data();
size_t srcLength = fileText.size();
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_mem dCuboid, dRes;
size_t localSize[2] = { k,m };
size_t globalSize[2] = { ceil(size / (float)localSize[0]) * localSize[0], ceil(size / (float)localSize[1]) * localSize[1] };
// Get GPU
error |= clGetPlatformIDs(1, &platform, NULL);
error |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Compile and build
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
program = clCreateProgramWithSource(context, 1, &srcText, &srcLength, &error);
error |= clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// What funtion from file we have to run
kernel = clCreateKernel(program, "distributeKernel", &error);
// Add to Queue
queue = clCreateCommandQueueWithProperties(context, device, NULL, &error);
// Create buffer
dCuboid = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
dRes = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
// Write data to buffer
error |= clEnqueueWriteBuffer(queue, dCuboid, CL_TRUE, 0, sizeof(float) * size, cuboid, 0, NULL, NULL);
// Kernel args
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &dCuboid);
error |= clSetKernelArg(kernel, 1, sizeof(int), &k);
error |= clSetKernelArg(kernel, 2, sizeof(int), &m);
error |= clSetKernelArg(kernel, 3, sizeof(int), &n);
error |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dRes);
// Start task
error |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
// Wait execution
clFinish(queue);
// Read Result
error |= clEnqueueReadBuffer(queue, dRes, CL_TRUE, 0, sizeof(float) * size, hResult, 0, NULL, NULL);
//printCuboid(resP, k, m, n, resPFile);
// Deallocation
clReleaseKernel(kernel);
clReleaseMemObject(dCuboid);
clReleaseMemObject(dRes);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return hResult;
}
int main(int argc, char* argv[])
{
std::ofstream filledFile("filled.txt");
std::ofstream resLFile("resL.txt");
std::ofstream resPFile("resP.txt");
double durationL, durationP, time1, time2;
int k = 5, m = 5, n = 5, temp1 = 10, temp2 = 15;
float*** cuboid, *** resL, *** resP;
if (argc > 1) {
k = atoi(argv[1]), m = atoi(argv[2]), n = atoi(argv[3]),
temp1 = atoi(argv[4]), temp2 = atoi(argv[5]);
}
// Linear
cuboid = initCuboid(k, m, n);
fillCuboid(cuboid, k, m, n, temp1, temp2);
printCuboidToFile(cuboid, k, m, n, filledFile);
time1 = omp_get_wtime();
resL = distribute(cuboid, k, m, n);
time2 = omp_get_wtime();
durationL = time2 - time1;
printCuboidToFile(resL, k, m, n, resLFile);
// Parallel
time1 = omp_get_wtime();
resP = distributeOpenCL(cuboid, k, m, n);
time2 = omp_get_wtime();
durationP = time2 - time1;
//printCuboidToFile(resP, k, m, n, resPFile);
std::cout << "Linear time: " << durationL << std::endl;
std::cout << "Parallel time: " << durationP << std::endl;
std::cout << "Parallel faster than linear on: " << durationL - durationP << std::endl;
// Delete 3d arrays, closing files
deleteCuboid(cuboid, k, m, n);
deleteCuboid(resL, k, m, n);
deleteCuboid(resP, k, m, n);
filledFile.close();
resLFile.close();
resPFile.close();
return 0;
}
program.cl
__kernel void distributeKernel(__global float*** cuboid, int k, int m, int n, __global float*** result)
{
int gz = get_global_id(0);
int gy = get_global_id(1);
printf("gy - %d \n", &gy);
printf("gz - %d \n", &gz);
bool isDissipated = false;
int size = k * m * n;
// Ends if temperatures in cube becomes balanced
while (!isDissipated) {
int dissipatedCount = 0;
for (int x = 0; x < n; x++) {
// Calc average temperature
float sum = 0;
int count = 0;
float average;
for (int zSum = gz - 1; zSum <= gz + 1; zSum++) {
for (int ySum = gy - 1; ySum <= gy + 1; ySum++) {
for (int xSum = x - 1; xSum <= x + 1; xSum++) {
if (zSum >= 0 && ySum >= 0 && xSum >= 0
&& zSum < k && ySum < m && xSum < n) {
count++;
sum += result[gz][gy][xSum];
}
}
}
}
average = round(sum / count * 100) / 100;
if (average == result[gz][gy][x]) {
dissipatedCount++;
}
else {
result[gz][gy][x] = average;
}
}
if (dissipatedCount == size) {
isDissipated = true;
}
}
}
To get the issue with the supposedly wrong get_global_id() fixed, start with a simple, minimal "Hello World"-style vector addition program and than advance forward to your temperature distribution application step-by-step.
With your code I see several issues:
You can only have 1D pointers (with a single *) in OpenCL.
__kernel void distributeKernel(__global float* cuboid, __global float* result)
Introduce a linear index to access more than 1 dimension: For 2D for example int n = x+y*get_global_size(0);
From what I see, k, m, n are lattice dimensions. Eliminate them from the kernel entirely. Get size via get_global_size(...).
The kernel looks rather complex with a lot of loops and branching. This could kill any performance benefit you hope to get from GPU parallelization. Get rid of loops and branching as far as possible. Also, there should not be any loop over one of the lattice dimensions since the lattice position is what you parallelize.
I would also advice to use only 1D parallelization in OpenCL and do the linear indexing yourself. This gives you more flexibility regarding workgroup size.
I developed a small code to add two small vector using GPU by OpenCL library. The main code vectorAdd.cc is as follows:
#include <iostream>
#include <CL/cl.hpp>
#include <cassert>
#include <fstream>
#include <time.h>
#include <cmath>
void randomInit(float *data, int size)
{
for (unsigned int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
int main()
{
//get all platforms (drivers)
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
assert(platforms.size() > 0);
cl::Platform myPlatform = platforms[0];
std::cout << "Using platform: "<<myPlatform.getInfo<CL_PLATFORM_NAME>()<<"\n";
//get default device of the default platform
std::vector<cl::Device> devices;
myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
assert(devices.size() > 0);
cl::Device myDevice = devices[0];
std::cout<< "Using device: "<<myDevice.getInfo<CL_DEVICE_NAME>()<<"\n";
std::ifstream vectorAddFile("vector_add_kernel.cl" );
std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
cl::Context context(myDevice);
cl::Program program(context, sources);
int szVec = 10;
float* A = new float[szVec];
float* B = new float[szVec];
randomInit(A,szVec);
randomInit(B,szVec);
float* C = new float[szVec];
std::fill_n(C, szVec, 0);
// create buffers on the device
cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, myDevice);
//write arrays A and B to the device
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);
auto err = program.build("cl.std.CL1.2");
// run the kernel
cl::Kernel kernel(program,"vector_add", &err);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
queue.finish();
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);
std::cout<<" result: \n";
for(int i = 0; i < szVec; i++)
{
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
}
std::cout << std::endl;
return 0;
}
and the kernel code vector_add_kernel.cl is as follows:
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
{
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
and the result i got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
result:
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem as you can see, the result is always what I initialized vector C, I do not understand why. I also initialized vectorC with some other values and again the result was the initial values.
It's probably just a syntax error.
auto err = program.build("cl.std.CL1.2");
should be
auto err = program.build("-cl-std=CL1.2");
The documentation on clBuildProgram has more information about the supported options.
The problem stems from building the program with this command
auto err = program.build("cl.std.CL1.2");
and by replacing the command above with
auto err = program.build();
The problem solved.
But still I do not know why this happened. Any idea?
I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D among many others including NVIDIA docs and I can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix in an array format (Matrix[width*height]) along with a simple array to perform Matrix * vector operations. It is not optional for me to use cudaMallocPitch in order to avoid conflicts and have a better performance.
So, I started by just trying to copy the matrix (vector in my case) to the device and check if it was correctly copied but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < N)
{
double e = *(double *)(((char *) A + idx * mpitch) + N);
printf("(%f)", e);
}
}
int main()
{
int N = 1500;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaFree(d_A);
delete [] A;
return 0;
}
Error checking can make a big difference in debugging. You should always use it before coming here.
It wasn't clear if you wanted a row or column vector i.e. a matrix of [1xN] or [Nx1]
I've added an explanation on Talomnies suggestion, but first the 'working slabs of code'
Here's [Nx1]
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
double e = *(double *)(((char *) A + idx * mpitch));
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
[1xN]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
int row=0;//only one row
double *row_ptr = (double *)( (char *) (A + mpitch * row) );
double e = row_ptr[idx];
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
Explanation
Firslty, Error Handling:
Considering how easy error handling is in CUDA there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.
The reoccurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so lets walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).
Allocating:
Your allocation was written out as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
This is correct for the row vector as the API is cudaMallocPitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) However if we would like to do a column vector correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
Accessing:
For accessing you were mixing up accessing a row, and accessing an element in the row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Once again stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
for us this translates into
int column=0;
double element=(double*) ((char*)A + idx * mpitch) + column;
I've used column = 0 for completeness since we do not have more than one column.
Copying:
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
For this case this is correct. API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);
I am trying to add 2 arrays using CUDA , but it didn't work .
I did all that it should be done:
1) I parallelized the VectorAdd function
2) I allocated memory to the GPu and moved the data to the GPU
3) And last thing i modified the function VectorAdd to run on the GPU
This is the code :
#define SIZE 1024
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
int i = threadIdx.x ;
if(i < n)
c[i] = a[i] + b[i];
}
int main()
{
int *a , *b , *c;
int *d_a , *d_b , *d_c;
a = (int *)malloc(SIZE * sizeof(int));
b = (int *)malloc(SIZE * sizeof(int));
c = (int *)malloc(SIZE * sizeof(int));
cudaMalloc( &d_a , SIZE * sizeof(int) );
cudaMalloc( &d_b , SIZE * sizeof(int) );
cudaMalloc( &d_c , SIZE * sizeof(int) );
for ( int i = 0 ; i < SIZE ; ++i)
{
a[i] = i ;
b[i] = i ;
c[i] = 0 ;
}
cudaMemcpy(d_a, a, SIZE *sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE *sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE *sizeof(int), cudaMemcpyHostToDevice);
VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0 ; i < 10 ; ++i)
{
printf("C[%d] = %d\n", i, c[i]);
}
free(a);
free(b);
free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
The output on the console is this :
c[0] = 0 , c[1] = 0 , c[2] = 0 , c[3] = 0 , c[4] = 0 ....
Why is that it should be :
c[0] = 0 ; c[1] = 2 ; c[2] = 4 ....
In your case the problem depends on your used gpu. Your kernel is launched with 1024 threads per block. Since your gpu is of compute capability 1.x only 512 or 768 threads per block are supported. A detailed list can be found in the official programming guide.
Because you didn't use proper cuda error checking, you weren't possible to get the error returned by the cuda runtime api. A good guide for cuda error checking is given by #talonmies in this SO answer/question.
There's code, that uses GPU:
__global__ void gpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
float* in_d, *w_d, *out_d;
cudaMalloc((void**)&in_d, psize * sizeof(float));
cudaMalloc((void**)&w_d, psize * size * sizeof(float));
cudaMalloc((void**)&out_d, size * sizeof(float));
for(size_t i = 0; i < size; i++)
output[i] = 0;
cudaMemcpy(in_d, input, psize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w_d, weights, psize * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(out_d, output, size * sizeof(float), cudaMemcpyHostToDevice);
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(ceil(float(rx) / block_x), ceil(float(ry) / block_y));
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
cudaThreadSynchronize();
cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(in_d);
cudaFree(out_d);
cudaFree(w_d);
}
There's code, that do the same thing, but uses only CPU:
int blockIdxx, blockIdxy, blockDimx, blockDimy, threadIdxx, threadIdxy;
void cpu_process(float* input, float* weights, float* output, int psize, int size)
{
int i = blockIdxx*blockDimx + threadIdxx;
int j = blockIdxy*blockDimy + threadIdxy;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
}
void process(float* input, float* weights, float* output, size_t psize, size_t size)
{
for(size_t i = 0; i < size; i++)
output[i] = 0;
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
blockDimx = block_x;
blockDimy = block_y;
int gridDimx = ceil(float(rx) / block_x), gridDimy = ceil(float(ry) / block_y);
for(blockIdxx = 0; blockIdxx < gridDimx; blockIdxx++)
for(blockIdxy = 0; blockIdxy < gridDimy; blockIdxy++)
for(threadIdxx = 0; threadIdxx < blockDimx; threadIdxx++)
for(threadIdxy = 0; threadIdxy < blockDimy; threadIdxy++)
cpu_process(input, weights, output, psize, size);
}
Why CPU variant works correctly but GPU variant returns garbage in output? What differs in
Version of cuda-toolkit: 4.0
OS: Debian GNU/Linux, cuda installed from it's repositories.
GPU: NVIDIA GeForce GT 525M.
cudaThreadSyncronize is deprecated and should not be used, instead use cudaDeviceSyncronize, check the error codes of these, since they will return an error if a thread has failed. These also block all code thereafter until the task is completed, so you could also add some timing code inbetween to find bottlenecks.