simple CUDA graph example doesn't produce expected result - C++

I am testing out CUDA graphs. My graph is as follows: kernel1 runs first; kernel2 and kernel3 both depend on kernel1; a device-to-host memcpy depends on both of them; and a host node that prints the buffer runs last.
The code for this is as follows:
#include <cstdio>
#include <cstdlib>
#include <cstring> // for memset
#include <fstream>
#include <iostream>
#include <vector>
#define NumThreads 20
#define NumBlocks 1
template <typename PtrType>
__global__ void kernel1(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
buffer[tid] = (PtrType)tid;
}
template <typename PtrType>
__global__ void kernel2(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < numElems/2) buffer[tid] += 5;
}
template <typename PtrType>
__global__ void kernel3(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid>=numElems/2) buffer[tid] *= 5;
}
template <typename PtrType>
void print(void *data) {
PtrType *buffer = (PtrType *)data;
std::cout << "[";
for (unsigned int i = 0; i < NumThreads; ++i) {
std::cout << buffer[i] << ",";
}
std::cout << "]\n";
}
void runCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream) {
cudaGraphInstantiate(&graphExec, Graph, nullptr, nullptr, 0);
cudaStreamCreateWithFlags(&graphStream, cudaStreamNonBlocking);
cudaGraphLaunch(graphExec, graphStream);
cudaStreamSynchronize(graphStream);
}
void destroyCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream) {
cudaCtxResetPersistingL2Cache();
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(Graph);
cudaStreamDestroy(graphStream);
cudaDeviceReset();
}
template <typename PtrType>
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream, PtrType *buffer,
unsigned int numElems, PtrType *hostBuffer) {
cudaGraphCreate(&Graph, 0);
cudaGraphNode_t Kernel1;
cudaKernelNodeParams nodeParams = {0};
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel1<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
void *inputs[2];
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = nullptr;
cudaGraphAddKernelNode(&Kernel1, Graph, nullptr, 0, &nodeParams);
cudaGraphNode_t Kernel2;
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel2<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = NULL;
cudaGraphAddKernelNode(&Kernel2, Graph, &Kernel1, 1, &nodeParams);
cudaGraphNode_t Kernel3;
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel3<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = NULL;
cudaGraphAddKernelNode(&Kernel3, Graph, &Kernel1, 1, &nodeParams);
cudaGraphNode_t copyBuffer;
std::vector<cudaGraphNode_t> dependencies = {Kernel2, Kernel3};
cudaGraphAddMemcpyNode1D(&copyBuffer, Graph,dependencies.data(),dependencies.size(),hostBuffer, buffer, numElems*sizeof(PtrType), cudaMemcpyDeviceToHost);
cudaGraphNode_t Host1;
cudaHostNodeParams hostNodeParams;
memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = print<PtrType>;
hostNodeParams.userData = (void *)&hostBuffer;
cudaGraphAddHostNode(&Host1, Graph, &copyBuffer, 1,
&hostNodeParams);
}
int main() {
cudaGraph_t graph;
cudaGraphExec_t graphExec;
cudaStream_t graphStream;
unsigned int numElems = NumThreads;
unsigned int bufferSizeBytes = numElems * sizeof(unsigned int);
unsigned int hostBuffer[numElems];
memset(hostBuffer, 0, bufferSizeBytes);
unsigned int *deviceBuffer;
cudaMalloc(&deviceBuffer, bufferSizeBytes);
createCudaGraph(graph, graphExec, graphStream, deviceBuffer,numElems, hostBuffer);
runCudaGraph(graph, graphExec, graphStream);
destroyCudaGraph(graph, graphExec, graphStream);
std::cout << "graph example done!" << std::endl;
}
When I run this example I get a result of
[3593293488,22096,3561843129,22096,3561385808,22096,3593293488,22096,3598681264,22096,3561792984,22096,2687342880,0,0,0,3598597376,22096,3598599312,0,]
However I expect:
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
I can't figure out where I went wrong. I used cuda-gdb and the data looks right on the GPU. However, somewhere between the memcpy and the host print function it goes wrong. Any ideas?

You are not passing the correct pointer to the host callback.
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream, PtrType *buffer,
unsigned int numElems, PtrType *hostBuffer) {
...
hostNodeParams.userData = (void *)&hostBuffer;
}
hostBuffer is a parameter of createCudaGraph, so &hostBuffer is the address of a local (stack) variable that is no longer valid by the time the host function runs.
Since hostBuffer already points to the array you want to print, you can just pass that pointer directly:
hostNodeParams.userData = (void *)hostBuffer;
That aside, I would like to mention the handy function cudaGraphDebugDotPrint, which can write a graph out to a DOT file that can be converted to a PNG to help with debugging. With your original code, it will show that the pointer used as the memcpy destination and the pointer passed to the host function are different.
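As a minimal sketch (assuming a CUDA toolkit recent enough to provide cudaGraphDebugDotPrint; the helper name and file name are arbitrary), dumping the graph can look like this:
#include <cuda_runtime.h>

// Write the graph topology plus node parameters to a DOT file; the verbose flag
// includes node parameters such as the pointers mentioned above.
void dumpGraph(cudaGraph_t graph) {
    cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose);
}
The resulting graph.dot can then be rendered with Graphviz, for example with: dot -Tpng graph.dot -o graph.png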

Related

C++ OpenCL Abstraction Not giving desired result

I tried to do a basic abstraction for OpenCL. Here it is:
OpenCLBuffer:
Header:
class OpenCLBuffer {
void* GetNativeID() { return obj; }
cl_mem obj;
cl_command_queue commandQueue;
cl_context context;
cl_int ret;
int type;
int maxSize;
int currSize;
};
Impl:
OpenCLBuffer::OpenCLBuffer(cl_context cont, cl_command_queue queue, cl_int t, unsigned int size)
{
context = cont;
commandQueue = queue;
maxSize = size;
type = t;
obj = clCreateBuffer(context, t, size, NULL, &ret);
}
OpenCLBuffer::~OpenCLBuffer()
{
ret = clReleaseMemObject(obj);
}
void OpenCLBuffer::SetData(int size, void* data, int offset)
{
currSize = size;
ret = clEnqueueWriteBuffer(commandQueue, obj, CL_TRUE, offset, size, data, 0, NULL, NULL);
}
void OpenCLBuffer::GetData(void* data, int size)
{
if (size == -1)
size = currSize;
ret = clEnqueueReadBuffer(commandQueue, obj, CL_TRUE, 0, size, data, 0, NULL, NULL);
}
OpenClContext:
Header:
class OpenCLContext {
// I removed the function definitions from the question as they are already in the Impl part
cl_platform_id plarformId;
cl_device_id deviceId;
cl_context context;
cl_uint numDevices;
cl_uint numPlatforms;
cl_command_queue commandQueue;
cl_int ret;
char name[1024];
};
Impl:
static void _stdcall OpenCLErrorFunc(const char* errinfo, const void* private_info, size_t cb, void* user_data){
std::cout << "OpenCL (" << user_data << ") Error : \n" << errinfo << "\n";
}
OpenCLContext::OpenCLContext(std::string n)
{
ret = clGetPlatformIDs(1, &plarformId, &numPlatforms);
ret = clGetDeviceIDs(plarformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceId, &numDevices);
context = clCreateContext(NULL, 1, &deviceId, OpenCLErrorFunc, name, &ret);
commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret);
memcpy_s(name, 1024, n.data(), std::min(1024, (int)n.size()));
}
OpenCLContext::~OpenCLContext()
{
for (std::pair<std::string, char*> data : sources) {
if (data.second)
delete data.second;
}
ret = clFlush(commandQueue);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
}
OpenCLBuffer* OpenCLContext::CreateBuffer(void* data, int size, int type)
{
OpenCLBuffer* buffer = new OpenCLBuffer(context, commandQueue, type, size);
buffer->SetData(size, data);
return buffer;
}
OpenCLBuffer* OpenCLContext::CreateBuffer(int size, int type)
{
OpenCLBuffer* buffer = new OpenCLBuffer(context, commandQueue, type, size);
return buffer;
}
void OpenCLContext::AddProgram(std::string name, std::string source)
{
char* sc = new char[source.size()];
memcpy_s(sc, source.size(), source.data(), source.size());
sources[name] = sc;
int sourceSize = source.size();
programs[name] = clCreateProgramWithSource(context, 1, (const char**)&sc, (const size_t*)&sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
void OpenCLContext::MakeKernel(std::string programName, std::string kernelName)
{
kernels[kernelName] = clCreateKernel(programs[programName], kernelName.c_str(), &ret);
}
void OpenCLContext::SetKernelArg(std::string kernelName, int num, int size, void* arg)
{
ret = clSetKernelArg(kernels[kernelName], num, size, arg);
}
void OpenCLContext::ReleaseKernerl(std::string kernelName)
{
ret = clFlush(commandQueue);
ret = clReleaseKernel(kernels[kernelName]);
}
void OpenCLContext::ReleaseProgram(std::string programName)
{
ret = clFlush(commandQueue);
ret = clReleaseProgram(programs[programName]);
}
void OpenCLContext::Dispatch(std::string kernelName, int globalItemSize, int localItemSize)
{
ret = clEnqueueNDRangeKernel(commandQueue, kernels[kernelName], 1, NULL, (const size_t*)&globalItemSize, (const size_t*)&localItemSize, 0, NULL, NULL);
}
Driver Code:
std::string shadersrc = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
)";
const int LIST_SIZE = 1024;
int* A = (int*)malloc(sizeof(int) * LIST_SIZE);
int* B = (int*)malloc(sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
context = new OpenCLContext("Vector Adder");
a = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
b = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
c = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_WRITE_ONLY);
a->SetData(LIST_SIZE * sizeof(int), A);
b->SetData(LIST_SIZE * sizeof(int), B);
context->AddProgram("VectorAdderSrc", shadersrc);
context->MakeKernel("VectorAdderSrc", "vector_add");
context->SetKernelArg("vector_add", 0, sizeof(cl_mem), a->GetNativeID());
context->SetKernelArg("vector_add", 1, sizeof(cl_mem), b->GetNativeID());
context->SetKernelArg("vector_add", 2, sizeof(cl_mem), c->GetNativeID());
context->Dispatch("vector_add", LIST_SIZE, 64);
int* C = (int*)malloc(sizeof(int) * LIST_SIZE);
memset(C, 0, sizeof(int) * LIST_SIZE);
c->GetData(c, sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
Sometimes I am getting a Read Access Violation and sometimes:
0 + 1024 = 0
1 + 1023 = 0
2 + 1022 = 0
3 + 1021 = 0
...
Then crash.
Could you please help me find the problems?
First, a few general tips to make debugging easier:
Always check return values. Every OpenCL API call reports errors either via its return value or via an output parameter.
At the first occurrence of an error you should stop the rest of the program, as it will most likely not work anyway. Throwing an exception is a good strategy (a minimal checking helper is sketched after these tips).
Specifically for OpenCL, the error codes are defined in the main header cl.h, and you can find a code-to-string mapping routine here: Convenient way to show OpenCL error codes?
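As a minimal sketch of such a checking helper (clCheck is just a placeholder name, not part of your code or of the OpenCL API):
#include <CL/cl.h>
#include <stdexcept>
#include <string>

// Throw as soon as any OpenCL call reports something other than CL_SUCCESS,
// so the failing call is the one that surfaces instead of a later crash.
inline void clCheck(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
        throw std::runtime_error(std::string(what) + " failed with OpenCL error " + std::to_string(status));
}

// Usage, e.g. in your constructor:
//   clCheck(clGetPlatformIDs(1, &plarformId, &numPlatforms), "clGetPlatformIDs");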
Regarding your code, the first error comes from your function AddProgram: clCreateProgramWithSource returns CL_OUT_OF_HOST_MEMORY. Your decision to cast sourceSize from int* to size_t* is the problem, since the two types are not the same size, so the API reads a corrupted 64-bit length value.
Here is a better implementation:
void AddProgram(std::string name, std::string source)
{
const char* sc = source.c_str();
size_t sourceSize[1] = {source.size()};
programs[name] = clCreateProgramWithSource(context, 1, &sc, sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
There is no need to keep the source code in memory after it has been compiled, but if you really want to, I suggest storing std::string objects because they save the hassle of managing memory manually.
The next problem is that clEnqueueNDRangeKernel returns CL_INVALID_WORK_GROUP_SIZE. Here you have the same int* to size_t* cast problem again, which passes bad work sizes to the function.
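A sketch of how Dispatch could look without the cast, reusing the member names from your class: build real size_t values and pass their addresses.
void OpenCLContext::Dispatch(std::string kernelName, int globalItemSize, int localItemSize)
{
    // Convert the values instead of reinterpreting int* as size_t*.
    size_t globalSize = static_cast<size_t>(globalItemSize);
    size_t localSize = static_cast<size_t>(localItemSize);
    ret = clEnqueueNDRangeKernel(commandQueue, kernels[kernelName], 1, NULL,
                                 &globalSize, &localSize, 0, NULL, NULL);
}
static_cast converts the value rather than reinterpreting the pointer, which also follows the advice below about preferring C++ casts.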
Next, your calls to SetKernelArg return CL_INVALID_MEM_OBJECT. This is because the last argument, in the case of OpenCL buffers, is expected to be a pointer to a cl_mem object (in your case, the address of the cl_mem whose value GetNativeID() returns).
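A sketch of what that means in your driver code (the local memA/memB/memC variables are introduced here only for illustration):
// Keep each buffer handle in a local cl_mem and pass its address to the kernel-arg call.
cl_mem memA = static_cast<cl_mem>(a->GetNativeID());
cl_mem memB = static_cast<cl_mem>(b->GetNativeID());
cl_mem memC = static_cast<cl_mem>(c->GetNativeID());
context->SetKernelArg("vector_add", 0, sizeof(cl_mem), &memA);
context->SetKernelArg("vector_add", 1, sizeof(cl_mem), &memB);
context->SetKernelArg("vector_add", 2, sizeof(cl_mem), &memC);
Alternatively, GetNativeID could return the address of the stored cl_mem instead of its value, so the wrapper call stays a one-liner.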
Finally, there is a typo in the line c->GetData(c, sizeof(int) * LIST_SIZE); as it should be c->GetData(C, sizeof(int) * LIST_SIZE);
That should make it work. Please pay attention to the tips above, and avoid C casts in favor of C++ casts.

CublasLt cublasLtMatmulAlgoGetHeuristic returns CUBLAS_STATUS_INVALID_VALUE for rows major matrix

I've just finished refactoring my program to use the cublasLt library for GEMM, and I ran into CUBLAS_STATUS_INVALID_VALUE when executing cublasLtMatmulAlgoGetHeuristic in the function below.
CudaMatrix.cu:product
/**
* Performs the matrix-matrix multiplication C = A x B
*
* @see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
*
* @param A - The left matrix A
* @param B - The right matrix B
* @param C - The result matrix C
* @param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* @param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* @param lightHandle - cublasLt handle
*/
template<typename precision>
void CudaMatrix<precision>::product(const CudaMatrix &A,
const CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceCreate(&preference));
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
std::cout << "returnedAlgoCount = " << returnedAlgoCount << std::endl;
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
cublasLtCk(cublasLtMatmulPreferenceDestroy(preference));
cublasLtCk(cublasLtMatmulDescDestroy(computeDesc));
}
I put together a minimal reproducible example below with the same source code as in my program (with some trimming).
This error may be related to a bug I found on the NVIDIA forum, but I am not sure.
I'm running on Ubuntu 18.04 with an RTX 5000 GPU.
cublaslt_mat_mul.cu
#include <iostream>
#include <iomanip>
#include <limits>
#include <vector>
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cublasLt.h>
// ****************************************************************************************************************** //
// ErrorsCheck.cuh //
// ****************************************************************************************************************** //
static const char* cublasGetErrorEnum(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
default:
return "<unknown>";
}
}
inline void cublasLtCheck(cublasStatus_t status, int iLine, const char *szFile) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "CublasLt error " << cublasGetErrorEnum(status) << " at line " << iLine << " in file "
<< szFile << std::endl;
}
}
inline void cudaCheck(cudaError_t status, int iLine, const char *szFile) {
if (status != cudaSuccess) {
std::cerr << "CublasLt error " << cudaGetErrorString(status) << " at line " << iLine << " in file "
<< szFile << std::endl;
}
}
#define cublasLtCk(call) cublasLtCheck(call, __LINE__, __FILE__)
#define cudaCk(call) cudaCheck(call, __LINE__, __FILE__)
// ****************************************************************************************************************** //
// CudaMatrix.cuh //
// ****************************************************************************************************************** //
#define MB 1048576 // 2^20 bytes
typedef unsigned int uint;
template <typename precision>
struct CudaMatrix {
// Matrix multiplication GPU workspace that can be used to improve matrix multiplication computation time
const static void *matMulWorkspace;
const static size_t matMulWorkspaceSize;
CudaMatrix() : width(0), height(0), data(nullptr), cublasHandle(nullptr), cublasLtHandle(nullptr), matrixLayout(nullptr) { };
CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr, cublasLtHandle_t cublasLtHandle = nullptr,
cublasLtMatrixLayout_t matrixLayout = nullptr) : width(width), height(height), cublasHandle(cublasHandle),
cublasLtHandle(cublasLtHandle), matrixLayout(matrixLayout)
{
cudaCk(cudaMalloc(&data, bytesSize()));
if (typeid(precision).hash_code() == typeid(uint).hash_code()) {
cublasLtDataType = CUDA_R_8U;
} else if (typeid(precision).hash_code() == typeid(int).hash_code()) {
cublasLtDataType = CUDA_R_8I;
} else if (typeid(precision).hash_code() == typeid(float).hash_code()) {
cublasLtDataType = CUDA_R_32F;
} else if (typeid(precision).hash_code() == typeid(double).hash_code()) {
cublasLtDataType = CUDA_R_64F;
} else {
throw std::runtime_error("The datatype " + std::string(typeid(precision).name()) + " is not handled in CudaMatrix");
}
cublasLtCk(cublasLtMatrixLayoutCreate(&matrixLayout, cublasLtDataType, height, width, width));
if (matMulWorkspace == nullptr) {
cudaCk(cudaMalloc(&matMulWorkspace, matMulWorkspaceSize));
}
}
__device__ __host__ uint size() const { return width * height; }
static void product(const CudaMatrix &A, const CudaMatrix &B, CudaMatrix &C, cublasOperation_t opA, cublasOperation_t opB, cublasLtHandle_t lightHandle);
void freeResources() { cudaCk(cudaFree(data)); cublasLtCk(cublasLtMatrixLayoutDestroy(matrixLayout)); }
uint bytesSize() const { return size() * sizeof(precision); }
void setValuesFromVector(const std::vector<precision> &vector);
void setValuesFromVector(const std::vector<std::vector<precision>> &vectors);
void display(const std::string &name = "", uint x = 0, uint y = 0, uint roiWidth = 0, uint roiHeight = 0) const;
void product(const CudaMatrix &A) { product(*this, A, *this, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle); }
precision *data;
uint width,
height;
cublasHandle_t cublasHandle;
cublasLtHandle_t cublasLtHandle;
cublasLtMatrixLayout_t matrixLayout;
cudaDataType_t cublasLtDataType;
};
template <typename precision> const size_t CudaMatrix<precision>::matMulWorkspaceSize = 500 * MB;
template <typename precision> const void* CudaMatrix<precision>::matMulWorkspace = nullptr;
// ****************************************************************************************************************** //
// CudaMatrix.cu //
// ****************************************************************************************************************** //
/**
* Display the matrix
*
* @tparam precision - The matrix precision
*
* @param name - The matrix name
*/
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name, uint x, uint y, uint roiWidth, uint roiHeight) const
{
precision *hostValues;
roiWidth == 0 ? roiWidth = width : roiWidth = roiWidth;
roiHeight == 0 ? roiHeight = height : roiHeight = roiHeight;
cudaCk(cudaMallocHost(&hostValues, bytesSize()));
cudaCk(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));
std::cout << std::setprecision(std::numeric_limits<precision>::digits10 + 1);
std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of "
<< abi::__cxa_demangle(typeid(precision).name(), nullptr, nullptr, nullptr)
<< "\n\n";
for (int i = y; i < y + roiHeight; ++i) {
std::cout << "{ ";
for (int j = x; j < x + roiWidth - 1; ++j) {
std::cout << *(hostValues + i * width + j) << ", ";
}
std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
}
std::cout << std::endl;
cudaCk(cudaFreeHost(hostValues));
}
/**
* Set the matrix values in device CUDA memory from a host standard 1D vector
*
* @tparam precision - The matrix precision
*
* @param vector - The values to set the device CUDA memory from
*/
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
{
cudaCk(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
}
/**
* Set the matrix values in device CUDA memory from a host standard 2D vector
*
* @tparam precision - The matrix precision
*
* @param vectors - The values to set the device CUDA memory from
*/
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<std::vector<precision>> &vectors)
{
std::vector<precision> buffer;
buffer.reserve(vectors.size() * vectors[0].size());
for (const auto &vector : vectors) {
buffer.insert(buffer.end(), vector.begin(), vector.end());
}
setValuesFromVector(buffer);
}
/**
* Performs the matrix-matrix multiplication C = A x B
*
* @see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
*
* @param A - The left matrix A
* @param B - The right matrix B
* @param C - The result matrix C
* @param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* @param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* @param lightHandle - cublasLt handle
*/
template<typename precision>
void CudaMatrix<precision>::product(const CudaMatrix &A,
const CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceCreate(&preference));
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
std::cout << "returnedAlgoCount = " << returnedAlgoCount << std::endl;
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
cublasLtCk(cublasLtMatmulPreferenceDestroy(preference));
cublasLtCk(cublasLtMatmulDescDestroy(computeDesc));
}
// Explicit template instantiations
template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;
template struct CudaMatrix<uint>;
// ****************************************************************************************************************** //
// main.cu //
// ****************************************************************************************************************** //
int main(int argc, char const *argv[])
{
cublasLtHandle_t cublasLtHandle = nullptr;
std::vector<float> r1Expect = { 6, 6, 6, 15, 15, 15, 24, 24, 24 };
std::vector<float> r2Expect = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
cublasLtCk(cublasLtCreate(&cublasLtHandle));
// Declare matrices
CudaMatrix<float> m1(3, 3);
CudaMatrix<float> m2(3, 3);
CudaMatrix<float> m3(3, 3);
CudaMatrix<float> deviceResult(3, 3);
// Set device memory values
m1.setValuesFromVector({ {1, 1, 1}, {1, 1, 1}, {1, 1, 1} });
m2.setValuesFromVector({ {1, 2, 3}, {4, 5, 6}, {7, 8, 9} });
m3.setValuesFromVector({ {1, 0, 0}, {0, 1, 0}, {0, 0, 1} });
// Test results (just showing it here)
CudaMatrix<float>::product(m1, m2, deviceResult, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
m1.display("m1");
m2.display("m2");
deviceResult.display("m1 X m2");
CudaMatrix<float>::product(m2, m3, deviceResult, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
m1.display("m2");
m2.display("m3");
deviceResult.display("m2 X m3");
// Clean up
cublasLtCk(cublasLtDestroy(cublasLtHandle));
m1.freeResources();
m2.freeResources();
m3.freeResources();
deviceResult.freeResources();
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(test-cuda)
# ------------------------------------------------ Compilation options ----------------------------------------------- #
# CUDA 10 does not support C++ 17
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
set(CMAKE_BUILD_TYPE Debug) # Release or Debug
# Include CUDA
find_package(CUDA REQUIRED)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_75 -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda")
# ----------------------------------------------------- Constants ---------------------------------------------------- #
if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release")
MESSAGE(STATUS "Debug build")
add_definitions(-DDEBUG_CUDA)
else ()
MESSAGE(STATUS "Release build")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
endif ()
# ------------------------------------------------- Source code files ------------------------------------------------ #
# All in one
file(GLOB matmul "cublaslt_mat_mul.cu")
# ---------------------------------------------------- Executables --------------------------------------------------- #
cuda_add_executable(matmulTest ${matmul})
# ---------------------------------------------------- Libraries ----------------------------------------------------- #
# Path to local libraries
file(GLOB CUDAlibs "/usr/lib/x86_64-linux-gnu/libcuda.so" "/usr/lib/x86_64-linux-gnu/libcublas.so" "/usr/lib/x86_64-linux-gnu/libcublasLt.so" "/usr/local/cuda/lib64/libcudart.so")
# Link libraries
target_link_libraries(matmulTest ${CUDAlibs})
Output
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 249 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
returnedAlgoCount = -768202864
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 256 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
Matrix m1 3 x 3 pixels of float
{ 1, 1, 1 }
{ 1, 1, 1 }
{ 1, 1, 1 }
Matrix m2 3 x 3 pixels of float
{ 1, 2, 3 }
{ 4, 5, 6 }
{ 7, 8, 9 }
Matrix m1 X m2 3 x 3 pixels of float
{ 0, 0, 0 }
{ 0, 0, 0 }
{ 0, 0, 0 }
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 249 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
returnedAlgoCount = -870514560
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 256 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
Matrix m2 3 x 3 pixels of float
{ 1, 1, 1 }
{ 1, 1, 1 }
{ 1, 1, 1 }
Matrix m3 3 x 3 pixels of float
{ 1, 0, 0 }
{ 0, 1, 0 }
{ 0, 0, 1 }
Matrix m2 X m3 3 x 3 pixels of float
{ 0, 0, 0 }
{ 0, 0, 0 }
{ 0, 0, 0 }
I made 2 mistakes:
The matrixLayout was not properly set; I wrote a function that sets it before each multiplication, based on the op applied to the matrix.
Additionally, I had laid the matrix memory out row major instead of column major, so the layout order has to be set explicitly.
Now the code is working well for square and non-square products with row-major memory.
cublaslt_mat_mul.cu
#include <iostream>
#include <iomanip>
#include <limits>
#include <vector>
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cublasLt.h>
// ****************************************************************************************************************** //
// ErrorsCheck.cuh //
// ****************************************************************************************************************** //
static const char* cublasGetErrorEnum(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
default:
return "<unknown>";
}
}
inline void cublasLtCheck(cublasStatus_t status, int iLine, const char *szFile) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "CublasLt error " << cublasGetErrorEnum(status) << " at line " << iLine << " in file "
<< szFile << std::endl;
}
}
inline void cudaCheck(cudaError_t status, int iLine, const char *szFile) {
if (status != cudaSuccess) {
std::cerr << "CublasLt error " << cudaGetErrorString(status) << " at line " << iLine << " in file "
<< szFile << std::endl;
}
}
#define cublasLtCk(call) cublasLtCheck(call, __LINE__, __FILE__)
#define cudaCk(call) cudaCheck(call, __LINE__, __FILE__)
// ****************************************************************************************************************** //
// CudaMatrix.cuh //
// ****************************************************************************************************************** //
#define MB 1048576 // 2^20 bytes
typedef unsigned int uint;
template <typename precision>
struct CudaMatrix {
// Matrix multiplication GPU workspace that can be used to improve matrix multiplication computation time
const static void *matMulWorkspace;
const static size_t matMulWorkspaceSize;
CudaMatrix() : width(0), height(0), data(nullptr), cublasHandle(nullptr), cublasLtHandle(nullptr), matrixLayout(nullptr) { };
CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr, cublasLtHandle_t cublasLtHandle = nullptr,
cublasLtMatrixLayout_t matrixLayout = nullptr) : width(width), height(height), cublasHandle(cublasHandle),
cublasLtHandle(cublasLtHandle), matrixLayout(matrixLayout)
{
cudaCk(cudaMalloc(&data, bytesSize()));
if (typeid(precision).hash_code() == typeid(uint).hash_code()) {
cublasLtDataType = CUDA_R_8U;
} else if (typeid(precision).hash_code() == typeid(int).hash_code()) {
cublasLtDataType = CUDA_R_8I;
} else if (typeid(precision).hash_code() == typeid(float).hash_code()) {
cublasLtDataType = CUDA_R_32F;
} else if (typeid(precision).hash_code() == typeid(double).hash_code()) {
cublasLtDataType = CUDA_R_64F;
} else {
throw std::runtime_error("The datatype " + std::string(typeid(precision).name()) + " is not handled in CudaMatrix");
}
if (matMulWorkspace == nullptr) {
cudaCk(cudaMalloc(&matMulWorkspace, matMulWorkspaceSize));
}
}
__device__ __host__ uint size() const { return width * height; }
static void product(CudaMatrix &A, CudaMatrix &B, CudaMatrix &C, cublasOperation_t opA, cublasOperation_t opB, cublasLtHandle_t lightHandle);
void freeResources() { cudaCk(cudaFree(data)); cublasLtCk(cublasLtMatrixLayoutDestroy(matrixLayout)); }
void setMatrixLayout(cublasOperation_t op, cublasLtOrder_t matrixOrder = CUBLASLT_ORDER_ROW);
uint bytesSize() const { return size() * sizeof(precision); }
void setValuesFromVector(const std::vector<precision> &vector);
void setValuesFromVector(const std::vector<std::vector<precision>> &vectors);
void display(const std::string &name = "", uint x = 0, uint y = 0, uint roiWidth = 0, uint roiHeight = 0) const;
void product(CudaMatrix &A) { product(*this, A, *this, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle); }
precision *data;
uint width,
height;
cublasHandle_t cublasHandle;
cublasLtHandle_t cublasLtHandle;
cublasLtMatrixLayout_t matrixLayout;
cudaDataType_t cublasLtDataType;
};
template <typename precision> const size_t CudaMatrix<precision>::matMulWorkspaceSize = 500 * MB;
template <typename precision> const void* CudaMatrix<precision>::matMulWorkspace = nullptr;
// ****************************************************************************************************************** //
// CudaMatrix.cu //
// ****************************************************************************************************************** //
/**
* Display the matrix
*
* @tparam precision - The matrix precision
*
* @param name - The matrix name
*/
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name, uint x, uint y, uint roiWidth, uint roiHeight) const
{
precision *hostValues;
roiWidth == 0 ? roiWidth = width : roiWidth = roiWidth;
roiHeight == 0 ? roiHeight = height : roiHeight = roiHeight;
cudaCk(cudaMallocHost(&hostValues, bytesSize()));
cudaCk(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));
std::cout << std::setprecision(std::numeric_limits<precision>::digits10 + 1);
std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of "
<< abi::__cxa_demangle(typeid(precision).name(), nullptr, nullptr, nullptr)
<< "\n\n";
for (int i = y; i < y + roiHeight; ++i) {
std::cout << "{ ";
for (int j = x; j < x + roiWidth - 1; ++j) {
std::cout << *(hostValues + i * width + j) << ", ";
}
std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
}
std::cout << std::endl;
cudaCk(cudaFreeHost(hostValues));
}
/**
* Set the matrix values in device CUDA memory from a host standard 1D vector
*
* @tparam precision - The matrix precision
*
* @param vector - The values to set the device CUDA memory from
*/
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
{
cudaCk(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
}
/**
* Set the matrix values in device CUDA memory from a host standard 2D vector
*
* @tparam precision - The matrix precision
*
* @param vectors - The values to set the device CUDA memory from
*/
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<std::vector<precision>> &vectors)
{
std::vector<precision> buffer;
buffer.reserve(vectors.size() * vectors[0].size());
for (const auto &vector : vectors) {
buffer.insert(buffer.end(), vector.begin(), vector.end());
}
setValuesFromVector(buffer);
}
/**
* Set the matrix layout before matrix multiplication with row major memory by default
*
* @tparam precision - The matrix precision
*
* @param op - Operation to perform on matrix before multiplication (none, transpose or hermitian)
* @param matrixOrder - The matrix memory order (column or row, DEFAULT row)
*/
template<typename precision>
void CudaMatrix<precision>:: setMatrixLayout(cublasOperation_t op, cublasLtOrder_t matrixOrder)
{
const uint m = (op == CUBLAS_OP_N ? height : width),
n = (op == CUBLAS_OP_N ? width : height);
cublasLtCk(cublasLtMatrixLayoutCreate(&matrixLayout, cublasLtDataType, m, n, height));
cublasLtCk(cublasLtMatrixLayoutSetAttribute(matrixLayout, CUBLASLT_MATRIX_LAYOUT_ORDER, &matrixOrder, sizeof(matrixOrder)));
}
/**
* Performs the matrix-matrix multiplication C = A x B
*
* @see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
*
* @param A - The left matrix A
* @param B - The right matrix B
* @param C - The result matrix C
* @param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* @param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* @param lightHandle - cublasLt handle
*/
template<typename precision>
void CudaMatrix<precision>::product(CudaMatrix &A,
CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Set matrices layout
A.setMatrixLayout(opA);
B.setMatrixLayout(opB);
C.setMatrixLayout(CUBLAS_OP_N);
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceCreate(&preference));
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
cublasLtCk(cublasLtMatmulPreferenceDestroy(preference));
cublasLtCk(cublasLtMatmulDescDestroy(computeDesc));
}
// Explicit template instantiations
template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;
template struct CudaMatrix<uint>;
// ****************************************************************************************************************** //
// main.cu //
// ****************************************************************************************************************** //
int main(int argc, char const *argv[])
{
cublasLtHandle_t cublasLtHandle = nullptr;
std::vector<float> r1Expect = { 6, 6, 6, 15, 15, 15, 24, 24, 24 };
std::vector<float> r2Expect = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
cublasLtCk(cublasLtCreate(&cublasLtHandle));
// Declare matrices
CudaMatrix<float> m1(3, 3);
CudaMatrix<float> m2(3, 3);
CudaMatrix<float> m3(3, 3);
CudaMatrix<float> m4(3, 2);
CudaMatrix<float> m5(2, 3);
CudaMatrix<float> deviceResult_2_2(2, 2);
CudaMatrix<float> deviceResult_3_3(3, 3);
// Set device memory values
m1.setValuesFromVector({ {1, 1, 1}, {1, 1, 1}, {1, 1, 1} });
m2.setValuesFromVector({ {1, 2, 3}, {4, 5, 6}, {7, 8, 9} });
m3.setValuesFromVector({ {1, 0, 0}, {0, 1, 0}, {0, 0, 1} });
m4.setValuesFromVector({ {1, 2, 3}, {4, 5, 6} });
m5.setValuesFromVector({ {1, 2}, { 3, 4 }, { 5 , 6 } });
// Test results (just showing it here)
CudaMatrix<float>::product(m1, m2, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m1 X m2");
CudaMatrix<float>::product(m2, m3, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m2 X m3");
CudaMatrix<float>::product(m4, m5, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m4 X m5");
CudaMatrix<float>::product(m5, m4, deviceResult_2_2, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_2_2.display("m5 X m4");
// Clean up
cublasLtCk(cublasLtDestroy(cublasLtHandle));
m1.freeResources();
m2.freeResources();
m3.freeResources();
deviceResult_2_2.freeResources();
deviceResult_3_3.freeResources();
return 0;
}

OpenCL: sample float4 program - Segmentation fault (core dumped)

It is a simple program that reads two float4 vectors from files and then calculates the sum of the opposite numbers.
I couldn't find the problem:
MAIN file:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
void checkError(cl_int err, const char *operation)
{
if (err != CL_SUCCESS)
{
fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
exit(1);
}
}
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of columns of file A,B (= array)
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
tempAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
tempBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
}
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
{
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
}
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 4
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
Kernel:
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
__global float4 *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int number_of_points = 16;
outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];
}
The first input file, A.txt:
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
The second input file B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
Thanks in advance
You are computing your array indices in your kernel in a fairly strange manner:
i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3
Think about what this actually translates to for different values of i (assuming number_of_points=16):
i array indices (i*16 + (0,1,2,3))
--------------------------------------
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
...
etc
This is surely not what you wanted! Your sample code appears to just be trying to perform a vectorised vector addition. If that's the case, your kernel code just needs to look something like this:
__kernel void vecadd(__global float4 *inputA,
__global float4 *inputB,
__global float4 *output)
{
int i = get_global_id(0);
output[i] = inputA[i] + inputB[i];
}
This works because we are performing the same operation on each element of the vector. If you have a kernel that needs to use these elements separately, you would write code like this:
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component
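For completeness, a sketch of that per-component version as a full kernel (the kernel name is arbitrary and the per-component operations are just the illustrative ones from above):
__kernel void percomponent(__global float4 *inputA,
                           __global float4 *inputB,
                           __global float4 *output)
{
    int i = get_global_id(0);
    float4 valueA = inputA[i];
    float4 valueB = inputB[i];
    float4 result;
    result.x = valueA.x + valueB.x; // Do something with first component
    result.y = valueA.y * valueB.y; // Do something with second component
    result.z = valueA.z / valueB.z; // Do something with third component
    result.w = valueA.w - valueB.w; // Do something with fourth component
    output[i] = result;             // Store the assembled vector
}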

Random123 generating random numbers for opencl using visual studio

http://www.thesalmons.org/john/random123/releases/1.00/docs/index.html
I have a hard time following the example for OpenCL and Random123, as I'm new to OpenCL, and I am not sure how I can use the provided information with Visual Studio 2010.
Can anyone put together a guide for generating random numbers with the above library using Visual Studio 2010?
UPDATE:
I solved it as follows and am now wondering how I change the seed so that I get different random numbers on each run.
int main(int argc, char **argv)
{
const char *kernelname = "counthits";
unsigned count =10000;
cl_int err;
cl::Context cl_context;
cl::Program program;
cl::Kernel cl_kernel;
cl::Buffer cl_out;
cl::CommandQueue cl_queue;
size_t i, nthreads, hits_sz;
size_t cores, work_group_size;
cl_uint2 * hits_host;
double d = 0.; // timer
d = timer(&d);
progname = argv[0];
std::vector< cl::Platform > platformList;
CHECK(cl::Platform::get(&platformList));
CHECKERR( cl_context = createCLContext(CL_DEVICE_TYPE_GPU,cl_vendor::VENDOR_AMD, &err) );
std::vector<cl::Device> devices;
CHECKERR( devices = cl_context.getInfo<CL_CONTEXT_DEVICES>(&err) );
size_t length = 0;
const char * sourceStr = loadFileToString("pi_opencl_kernel.ocl","",&length);
cl::Program::Sources sources(1, std::make_pair(sourceStr, length));
program = cl::Program(cl_context, sources);
CHECK( program.build(devices,"-I D:\\libs\\Random123\\1.06\\include") );
CHECKERR(work_group_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&err) );
CHECKERR(cores = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&err) );
cores *= 16*4; //Tahiti.
if (work_group_size > 64) work_group_size /= 2;
nthreads = cores * work_group_size*32; //2048*128 = 262144
if (count == 0)
count = NTRIES/nthreads; //38
printf("Count: %lu\n",count);
hits_sz = nthreads * sizeof(hits_host[0]);//2097152
CHECKNOTZERO(hits_host = (cl_uint2 *)malloc(hits_sz));
CHECKERR ( cl_out = cl::Buffer( cl_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hits_sz, hits_host, &err));
CHECKERR ( cl_kernel = cl::Kernel(program,kernelname,&err) );
CHECK ( cl_kernel.setArg( 0, count) );
CHECK ( cl_kernel.setArg( 1, cl_out) );
CHECKERR (cl_queue = cl::CommandQueue(cl_context, devices[0], 0, &err) );
cl::Event event;
CHECK( cl_queue.enqueueNDRangeKernel(cl_kernel,cl::NullRange,cl::NDRange(nthreads), cl::NDRange(work_group_size), NULL, &event) );
event.wait();
CHECK( cl_queue.enqueueReadBuffer(cl_out, CL_TRUE, 0,hits_sz, hits_host) );
unsigned long hits = 0, tries = 0;
for (i = 0; i < nthreads; i++) {
#ifdef _DEBUG
printf("%lu %u %u\n", (unsigned long)i, hits_host[i].s[0], hits_host[i].s[1]);
#endif
hits += hits_host[i].s[0];
tries += hits_host[i].s[1];
}
return pi_check(hits, tries);
}
Kernel:
#include <Random123/threefry.h>
/*
* counthits generates 2*n x,y points and returns hits[tid] with
* the count of number of those points within the unit circle on
* each thread.
*/
__kernel void counthits(unsigned n, __global uint2 *hitsp) {
unsigned tid = get_global_id(0);
unsigned hits = 0, tries = 0;
threefry4x32_key_t k = {{tid, 0xdecafbad, 0xfacebead, 0x12345678}};
threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
while (tries < n) {
union {
threefry4x32_ctr_t c;
int4 i;
} u;
c.v[0]++;
u.c = threefry4x32(c, k);
long x1 = u.i.x, y1 = u.i.y;
long x2 = u.i.z, y2 = u.i.w;
if ((x1*x1 + y1*y1) < (1L<<62)) {
hits++;
}
tries++;
if ((x2*x2 + y2*y2) < (1L<<62)) {
hits++;
}
tries++;
}
hitsp[tid].x = hits;
hitsp[tid].y = tries;
}
I haven't tested this, but roughly speaking, something like the following:
Try changing the signature of counthits to:
__kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed)
Replace 0xdecafbad with seed
Add
char *seedstr = getenv("COUNTHITS_SEED");
unsigned seed = seedstr ? atoi(seedstr) : 0xdecafbad;
...
CHECK ( cl_kernel.setArg( 2, seed) );
to the main program (this setArg comes after setArg( 1, ...)), and you can, of course, obtain the seed value however you like; the environment variable above is just one option.
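Putting the pieces together, a sketch of the seeded kernel: it is your original counthits with the extra parameter, where the host-supplied seed replaces the fixed 0xdecafbad word in the key.
#include <Random123/threefry.h>

__kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed) {
    unsigned tid = get_global_id(0);
    unsigned hits = 0, tries = 0;
    // The seed goes into the key, so different runs draw from different streams.
    threefry4x32_key_t k = {{tid, seed, 0xfacebead, 0x12345678}};
    threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
    while (tries < n) {
        union {
            threefry4x32_ctr_t c;
            int4 i;
        } u;
        c.v[0]++;
        u.c = threefry4x32(c, k);
        long x1 = u.i.x, y1 = u.i.y;
        long x2 = u.i.z, y2 = u.i.w;
        if ((x1*x1 + y1*y1) < (1L<<62)) hits++;
        tries++;
        if ((x2*x2 + y2*y2) < (1L<<62)) hits++;
        tries++;
    }
    hitsp[tid].x = hits;
    hitsp[tid].y = tries;
}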

Error with passing a pointer using threads

Updated code: 3/7/11 : 9:29pm
#include <cstdlib>
#include <iostream>
#include <pthread.h>
using namespace std;
void * matrixACreate(void * param);
void *status;
struct a
{
int Arow; // Matrix A
int Acol; // WxX
int low; // Range low
int high;
};
int main(int argc, char * argv[])
{
struct a matrix_mult_info;
matrix_mult_info.Arow = atoi(argv[1]); // Matrix A
matrix_mult_info.Acol = atoi(argv[2]); // WxX
matrix_mult_info.low = atoi(argv[5]); // Range low
matrix_mult_info.high = atoi(argv[6]);
pthread_t matrixAthread;
pthread_t runner;
int error, retValue;
struct a * a = (struct a *) malloc(sizeof(struct a));
error = pthread_create(&matrixAthread, NULL, matrixACreate, a );
//error = pthread_create(&matrixAthread, NULL, matrixBCreate, sendB);
retValue = pthread_join(matrixAthread, &status);
//retValue = pthread_join(matrixBthread, &status);
return 0;
}
void * matrixACreate(void * param) {
struct a * matrix = (struct a *) param;
int range = ((matrix->high - matrix->low) + 1);
cout << matrix->Arow << endl;
return 0;
}
struct a * a = (struct a *) malloc(sizeof(struct a));
// init a's members
error = pthread_create(&matrixAthread, NULL, matrixACreate, a);
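For the "// init a's members" step, a minimal sketch that reuses the argv parsing already present in the question's main:
a->Arow = atoi(argv[1]); // Matrix A rows
a->Acol = atoi(argv[2]); // Matrix A columns (WxX)
a->low  = atoi(argv[5]); // Range low
a->high = atoi(argv[6]); // Range high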
EDIT: In response to updated question:
void * matrixACreate(void * param) {
struct a * matrix = (struct a *) param;
int range = ((matrix->high - matrix->low) + 1);
cout << matrix->Arow << endl;
return NULL;
}