Getting the initial values back from GPU vector addition in OpenCL - C++

I wrote a small program that adds two small vectors on the GPU using the OpenCL library. The main code, vectorAdd.cc, is as follows:
#include <iostream>
#include <fstream>
#include <cassert>
#include <cmath>
#include <cstdlib>   // rand, RAND_MAX
#include <vector>
#include <algorithm> // std::fill_n
#include <CL/cl.hpp>

void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

int main()
{
    // get all platforms (drivers)
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    assert(platforms.size() > 0);
    cl::Platform myPlatform = platforms[0];
    std::cout << "Using platform: " << myPlatform.getInfo<CL_PLATFORM_NAME>() << "\n";

    // get default device of the default platform
    std::vector<cl::Device> devices;
    myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    assert(devices.size() > 0);
    cl::Device myDevice = devices[0];
    std::cout << "Using device: " << myDevice.getInfo<CL_DEVICE_NAME>() << "\n";

    std::ifstream vectorAddFile("vector_add_kernel.cl");
    std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
    cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));

    cl::Context context(myDevice);
    cl::Program program(context, sources);

    int szVec = 10;
    float* A = new float[szVec];
    float* B = new float[szVec];
    randomInit(A, szVec);
    randomInit(B, szVec);
    float* C = new float[szVec];
    std::fill_n(C, szVec, 0);

    // create buffers on the device
    cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
    cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
    cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);

    // create a queue to which we will push commands for the device
    cl::CommandQueue queue(context, myDevice);

    // write arrays A and B to the device
    //queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
    //queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);

    auto err = program.build("cl.std.CL1.2");

    // run the kernel
    cl::Kernel kernel(program, "vector_add", &err);
    kernel.setArg(0, buffer_A);
    kernel.setArg(1, buffer_B);
    kernel.setArg(2, buffer_C);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
    queue.finish();

    // read result C from the device to array C
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);

    std::cout << " result: \n";
    for (int i = 0; i < szVec; i++)
    {
        std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
    }
    std::cout << std::endl;
    return 0;
}
and the kernel code vector_add_kernel.cl is as follows:
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
{
    // Get the index of the current element
    int i = get_global_id(0);

    // Do the operation
    C[i] = A[i] + B[i];
}
and the result I got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
result:
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem, as you can see, is that the result is always whatever I initialized vector C to, and I do not understand why. I also initialized vector C with some other values, and again the result was just those initial values.

It's probably just a syntax error.
auto err = program.build("cl.std.CL1.2");
should be
auto err = program.build("-cl-std=CL1.2");
The documentation on clBuildProgram has more information about the supported options.
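This also explains the symptom: with an unrecognized options string, the build fails (clBuildProgram returns CL_INVALID_BUILD_OPTIONS), so no kernel is ever compiled, and because err is never checked the failure goes unnoticed; the final read of buffer_C just returns the initial host data copied in via CL_MEM_COPY_HOST_PTR. A minimal sketch of a check that would have caught this, reusing the question's program and myDevice:
if (program.build("-cl-std=CL1.2") != CL_SUCCESS) {
    // on failure, print the compiler output for the device we targeted
    std::cerr << "Build failed:\n"
              << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(myDevice) << "\n";
    return 1;
}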

The problem stems from building the program with this command:
auto err = program.build("cl.std.CL1.2");
Replacing the command above with
auto err = program.build();
solved the problem. But I still do not know why this happened. Any idea?

OpenCL method get_global_id() works incorrectly on GPU

I want to parallelize a temperature distribution using OpenCL. I'm stuck on a problem with my GPU: the work-item ID is the same for every kernel invocation. Instead of IDs running from, for example, 0 to 1024, I get the same value every time, as shown in the screenshot below. What did I do incorrectly?
[screenshot of the printed work-item IDs]
Source.cpp
#include <iostream>
#include <string>
#include <fstream>
#include <omp.h>
#include <CL/cl.hpp>
float*** distributeOpenCL(float*** cuboid, int k, int m, int n)
{
    // OpenCL init
    int size = k * m * n;
    float*** hResult = initCuboid(k, m, n);
    cl_platform_id platform;
    cl_device_id device;
    cl_int error = 0;
    std::ifstream file("program.cl");
    std::string fileText = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
    const char* srcText = fileText.data();
    size_t srcLength = fileText.size();
    cl_context context;
    cl_program program;
    cl_kernel kernel;
    cl_command_queue queue;
    cl_mem dCuboid, dRes;
    size_t localSize[2] = { k, m };
    size_t globalSize[2] = { ceil(size / (float)localSize[0]) * localSize[0], ceil(size / (float)localSize[1]) * localSize[1] };
    // Get GPU
    error |= clGetPlatformIDs(1, &platform, NULL);
    error |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    // Compile and build
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    program = clCreateProgramWithSource(context, 1, &srcText, &srcLength, &error);
    error |= clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    // Which function from the file we have to run
    kernel = clCreateKernel(program, "distributeKernel", &error);
    // Add to queue
    queue = clCreateCommandQueueWithProperties(context, device, NULL, &error);
    // Create buffers
    dCuboid = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
    dRes = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
    // Write data to buffer
    error |= clEnqueueWriteBuffer(queue, dCuboid, CL_TRUE, 0, sizeof(float) * size, cuboid, 0, NULL, NULL);
    // Kernel args
    error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &dCuboid);
    error |= clSetKernelArg(kernel, 1, sizeof(int), &k);
    error |= clSetKernelArg(kernel, 2, sizeof(int), &m);
    error |= clSetKernelArg(kernel, 3, sizeof(int), &n);
    error |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dRes);
    // Start task
    error |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    // Wait for execution
    clFinish(queue);
    // Read result
    error |= clEnqueueReadBuffer(queue, dRes, CL_TRUE, 0, sizeof(float) * size, hResult, 0, NULL, NULL);
    //printCuboid(resP, k, m, n, resPFile);
    // Deallocation
    clReleaseKernel(kernel);
    clReleaseMemObject(dCuboid);
    clReleaseMemObject(dRes);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return hResult;
}

int main(int argc, char* argv[])
{
    std::ofstream filledFile("filled.txt");
    std::ofstream resLFile("resL.txt");
    std::ofstream resPFile("resP.txt");
    double durationL, durationP, time1, time2;
    int k = 5, m = 5, n = 5, temp1 = 10, temp2 = 15;
    float*** cuboid, *** resL, *** resP;
    if (argc > 1) {
        k = atoi(argv[1]), m = atoi(argv[2]), n = atoi(argv[3]),
            temp1 = atoi(argv[4]), temp2 = atoi(argv[5]);
    }
    // Linear
    cuboid = initCuboid(k, m, n);
    fillCuboid(cuboid, k, m, n, temp1, temp2);
    printCuboidToFile(cuboid, k, m, n, filledFile);
    time1 = omp_get_wtime();
    resL = distribute(cuboid, k, m, n);
    time2 = omp_get_wtime();
    durationL = time2 - time1;
    printCuboidToFile(resL, k, m, n, resLFile);
    // Parallel
    time1 = omp_get_wtime();
    resP = distributeOpenCL(cuboid, k, m, n);
    time2 = omp_get_wtime();
    durationP = time2 - time1;
    //printCuboidToFile(resP, k, m, n, resPFile);
    std::cout << "Linear time: " << durationL << std::endl;
    std::cout << "Parallel time: " << durationP << std::endl;
    std::cout << "Parallel faster than linear by: " << durationL - durationP << std::endl;
    // Delete 3D arrays, close files
    deleteCuboid(cuboid, k, m, n);
    deleteCuboid(resL, k, m, n);
    deleteCuboid(resP, k, m, n);
    filledFile.close();
    resLFile.close();
    resPFile.close();
    return 0;
}
program.cl
__kernel void distributeKernel(__global float*** cuboid, int k, int m, int n, __global float*** result)
{
    int gz = get_global_id(0);
    int gy = get_global_id(1);
    printf("gy - %d \n", &gy);
    printf("gz - %d \n", &gz);
    bool isDissipated = false;
    int size = k * m * n;
    // Ends when the temperatures in the cube become balanced
    while (!isDissipated) {
        int dissipatedCount = 0;
        for (int x = 0; x < n; x++) {
            // Calc average temperature
            float sum = 0;
            int count = 0;
            float average;
            for (int zSum = gz - 1; zSum <= gz + 1; zSum++) {
                for (int ySum = gy - 1; ySum <= gy + 1; ySum++) {
                    for (int xSum = x - 1; xSum <= x + 1; xSum++) {
                        if (zSum >= 0 && ySum >= 0 && xSum >= 0
                            && zSum < k && ySum < m && xSum < n) {
                            count++;
                            sum += result[gz][gy][xSum];
                        }
                    }
                }
            }
            average = round(sum / count * 100) / 100;
            if (average == result[gz][gy][x]) {
                dissipatedCount++;
            }
            else {
                result[gz][gy][x] = average;
            }
        }
        if (dissipatedCount == size) {
            isDissipated = true;
        }
    }
}
To get the issue with the supposedly wrong get_global_id() fixed, start with a simple, minimal "Hello World"-style vector addition program and then advance to your temperature distribution application step by step.
With your code I see several issues:
- You can only have 1D pointers (with a single *) in OpenCL:
  __kernel void distributeKernel(__global float* cuboid, __global float* result)
- Introduce a linear index to access more than one dimension; in 2D, for example: int n = x + y * get_global_size(0); (see the sketch below this list).
- From what I see, k, m, n are lattice dimensions. Eliminate them from the kernel entirely and get the size via get_global_size(...).
- The kernel looks rather complex, with a lot of loops and branching. This could kill any performance benefit you hope to get from GPU parallelization. Get rid of loops and branching as far as possible. Also, there should not be any loop over one of the lattice dimensions, since the lattice position is what you parallelize.
- I would also advise using only 1D parallelization in OpenCL and doing the linear indexing yourself. This gives you more flexibility regarding workgroup size.
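A minimal sketch of that indexing scheme (a hypothetical kernel, not the questioner's code): a 2D NDRange is flattened into one linear index so plain 1D pointers can be addressed.
__kernel void copy2D(__global const float* src, __global float* dst)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    // row-major linear index over the 2D range
    int i = x + y * (int)get_global_size(0);
    dst[i] = src[i];
}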

Trying to run a CusolverSSgels testcase, however it is not working

I'm busy working on an LS method. I manually implemented a conjugate gradient solver, but after updating my CUDA version I saw that there is a new function (cusolverDnSSgels) which I assume is faster than my manual implementation. My first task was to try to run it on a test case (see below); I'd expect the result to be -6.5, 9.7 according to MATLAB. Unfortunately I cannot find what I did wrong, and I also cannot find an example, because it is a relatively new function.
The output says that niter = -3, which according to the documentation would suggest too many iterations, but that does not make sense, as this is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"

int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;
    // create handle
    stat = cusolverDnCreate(&cusolverH);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float *gdwork;
    size_t work_bytes;
    stat = cusolverDnSSgels_bufferSize(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
    std::cout << "Status = " << stat << std::endl;
    int niter = 0;
    int dinfo = 0;
    cudaMalloc(&gdwork, work_bytes * sizeof(float));
    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
    std::cout << "Status = " << stat << std::endl;
    std::cout << "niter = " << niter << std::endl;
    std::cout << "dinfo = " << dinfo << std::endl;
    cudaDeviceSynchronize();
    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cusolverDnDestroy(cusolverH);
    return 0;
}
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
You have a problem with your dinfo parameter usage. Referring to the documentation, we see that:
Parameters of cusolverDngels() functions:
parameter | memory | in/out | meaning
dinfo     | device | output | Status of the IRS solver on return. If 0, the solve was successful. If dinfo = -i, then the i-th argument is not valid.
the dinfo parameter is expected to live in device memory. But you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>

int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;
    // create handle
    stat = cusolverDnCreate(&cusolverH);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float *gdwork;
    size_t work_bytes;
    stat = cusolverDnSSgels_bufferSize(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
    std::cout << "Status = " << stat << std::endl;
    int niter = 0;
    int *dinfo, hinfo;
    cudaMalloc(&gdwork, work_bytes * sizeof(float));
    cudaMalloc(&dinfo, sizeof(int));
    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
    cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Status = " << stat << std::endl;
    std::cout << "niter = " << niter << std::endl;
    std::cout << "dinfo = " << hinfo << std::endl;
    cudaDeviceSynchronize();
    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cusolverDnDestroy(cusolverH);
    return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$
Notes:
- I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
- You can get a hint as to the problem by running your code with cuda-memcheck.
- It was fairly quick to spot the problem by reviewing your parameter usage against the table of parameter locations (host/device) given in the documentation. You had a similar problem before, which could likewise be found by checking your parameter locations against that table. This may be a good thing to check to save yourself time in the future.

Not able to get the cublasSgelsbatched function to work

I'm currently trying to get cublasSgelsBatched (https://docs.nvidia.com/cuda/cublas/index.html) to work. I started by making a small test case to see exactly which parameters are needed and how they have to be input. However, after much trial and error I still can't get it to work: I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED, a very vague error. Other cuBLAS test cases I tried seem to work fine, and I also tested the input matrix in MATLAB, which does have an LS solution.
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
&gAmat,
lda, //or 1
&gYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
I'm on Windows 10, running in Visual Studio with CUDA 9.0.
I'd really appreciate some help.
As pointed out in the comments, you are not creating a proper array of pointers on the device. The batched function works with an array of pointers that lives in device memory, for the data parameters, for example:
Aarray device input/output array of pointers to array, with each array of dim. m x n with lda>=max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.
Passing for example &gAmat seems to satisfy the type requirement, but that pointer does not point to device memory.
The following modifications to your code focused on proper handling of gAmat and gYmat seem to run without error for me:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>

int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;
    // create handle
    stat = cublasCreate(&m_cuBLAS);
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };
    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    //run cublas
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
                                               CUBLAS_OP_N,
                                               C,
                                               M,
                                               1,
                                               ggAmat,
                                               lda, //or 1
                                               ggYmat,
                                               lda,
                                               &info,
                                               NULL,
                                               1);
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$
Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers, for each of A and Y.
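For example, a hedged sketch of what that would look like for a batch of two A matrices (hypothetical names, reusing M and C from the code above; Y would be handled the same way):
float *gA[2];                               // host array holding device pointers
cudaMalloc(&gA[0], M * C * sizeof(float)); // device storage for batch entry 0
cudaMalloc(&gA[1], M * C * sizeof(float)); // device storage for batch entry 1
float **ggA;                                // the Aarray argument itself
cudaMalloc(&ggA, 2 * sizeof(float*));       // device array of 2 pointers
cudaMemcpy(ggA, gA, 2 * sizeof(float*), cudaMemcpyHostToDevice);
// pass ggA to cublasSgelsBatched with batchSize = 2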
Based on comments below, here is a version of the code using non-random input:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>

int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t status;
    // create handle
    status = cublasCreate(&m_cuBLAS);
    std::cout << "status = " << status << std::endl;
    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };
    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    //run cublas
    status = cublasSgelsBatched(m_cuBLAS,
                                CUBLAS_OP_N,
                                C,
                                M,
                                1,
                                ggAmat,
                                lda, //or 1
                                ggYmat,
                                lda,
                                &info,
                                NULL,
                                1);
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);
    cudaFree(gAmat);
    cudaFree(gYmat);
    //destroy handle
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$

Shared memory in OpenCL

I intend to perform vector manipulations and was trying a small dummy program with vector addition and multiplication. However, the code does not run due to the limits of my knowledge of shared memory. All the sources on the internet show 2D matrix operations, which I cannot translate to my vector problem. Please try to explain where I am going wrong, considering that I am a novice in OpenCL. The code is given below:
Host Code:
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Context context;
cl::CommandQueue queue;
cl::Program program;
cl::Kernel kernel;
cl::Platform::get(&platforms);
deviceUsed = 0;
cl_context_properties properties[] =
    { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = cl::CommandQueue(context, devices[deviceUsed]);
cl::Program::Sources source( 1, std::make_pair(kernel_source.c_str(), kernel_source.size()));
program = cl::Program(context, source);
program.build(devices);
std::vector<float> a;
std::vector<float> b;
std::vector<float> sum;
std::vector<float> prod;
int globalSize = 128;
int localSize = 16;
a.resize(globalSize);
b.resize(globalSize);
sum.resize(globalSize);
prod.resize(globalSize);
for (int i = 0; i < globalSize; i++)
{
    a[i] = 1.0f * i;
    b[i] = 5.0f * i;
}
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_sum;
cl::Buffer buffer_prod;
buffer_A = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
buffer_B = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * globalSize , &a[0]);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * globalSize , &b[0]);
buffer_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
buffer_prod = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_sum);
kernel.setArg(3, buffer_prod);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize/localSize), cl::NDRange(N), NULL);
queue.finish();
queue.enqueueReadBuffer(buffer_sum, CL_TRUE, 0, sizeof(float) * globalSize, &sum[0]);
queue.enqueueReadBuffer(buffer_prod, CL_TRUE, 0, sizeof(float) * globalSize, &prod[0]);
Kernel:
#define STRINGI(ker) #ker
std::string kernel_source = STRINGI(
__kernel void KernelAddMul(__global float* a, __global float* b, __global float* sum, __global float* prod)
{
    unsigned int j = get_local_id(0);
    int N = get_local_size(0);
    unsigned int i = N * get_global_id(0) + j;
    float locSum[N];
    float locProd[N];
    __local float Asub[N];
    __local float Bsub[N];
    for (int k = 0; k < N; k++) {
        Asub[k] = a[i];
        Bsub[k] = b[i];
        barrier(CLK_LOCAL_MEM_FENCE);
        locSum[k] = Asub[k] + Bsub[k];
        locProd[k] = Asub[k] * Bsub[k];
        barrier(CLK_LOCAL_MEM_FENCE);
        sum[i] = locSum[k];
        prod[i] = locProd[k];
    }
}
);
I suspect that your code does not run because your kernel does not compile. The following lines are invalid:
int N = get_local_size(0);
float locSum[N];
float locProd[N];
__local float Asub[N];
__local float Bsub[N];
N must be a compile-time constant; you cannot dynamically size the arrays using get_local_size(0).
I strongly recommend that you use a standalone compiler to compile your kernels: CodeXL is very good, as is the Intel SDK for OpenCL. Anything is better than trying to debug your kernel in an application!
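If you do want workgroup-local storage, two standard options are to bake the size in at build time (e.g. pass -DN=16 as a build option and keep __local float Asub[N] in the kernel), or to let the host size the buffer with kernel.setArg(index, cl::Local(bytes)), which binds a __local pointer argument. A minimal sketch of the second option (a hypothetical kernel, not the question's code):
__kernel void KernelAddMulLocal(__global const float* a, __global const float* b,
                                __global float* sum, __global float* prod,
                                __local float* Asub)  // sized by the host via cl::Local
{
    unsigned int i = get_global_id(0);
    unsigned int j = get_local_id(0);
    Asub[j] = a[i];                    // stage one element per work-item
    barrier(CLK_LOCAL_MEM_FENCE);
    sum[i] = Asub[j] + b[i];
    prod[i] = Asub[j] * b[i];
}
On the host this would be accompanied by kernel.setArg(4, cl::Local(localSize * sizeof(float)));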

OpenCL kernel doesn't run correctly

I have the following kernel in my project:
__kernel void zero(__global float* vh)
{
    const float2 id = (float2)(get_global_id(0), get_global_id(1));
    const float2 sz = (float2)(1, get_global_size(0));
    vh[(int)dot(id, sz)] = 1;
}
And here's how I launch it:
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Platform::get(&platforms);
platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);
std::string GPUname;
devices.at(0).getInfo(CL_DEVICE_NAME, &GPUname);
std::cout << "Program runs on GPU: " << GPUname << std::endl;
cl::Context context(devices);
cl::CommandQueue queue(context, devices.at(0));
std::ifstream srcfile("kernels.cl");
std::string src(std::istreambuf_iterator<char>(srcfile), std::istreambuf_iterator<char>(0));
cl::Program program(context, cl::Program::Sources(1, std::make_pair(src.c_str(), src.size())));
program.build(devices, "-Werror");
cl::Kernel kzero = cl::Kernel(program, "zero");
cl::Buffer buffer(context, CL_MEM_READ_ONLY, N * N * sizeof(float));
cl::NDRange gndr(N, N), lndr(8, 8);
kzero.setArg(0, buffer);
queue.enqueueNDRangeKernel(kzero, cl::NullRange, gndr, lndr);
float data[N * N];
queue.enqueueReadBuffer(buffer, CL_TRUE, 0, N * N * sizeof(float), data);
queue.finish();
std::ofstream ofs;
ofs.open("solution.txt", std::ofstream::out);
for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
        ofs << data[i + N * j] << (j == N - 1 ? '\n' : ' ');
ofs.close();
std::cout << "File written";
return 0;
But every time I run my program, the file contains only zeros, although it should be ones. Where could the error be? I'm stuck on it.