I tried to write a basic abstraction for OpenCL. Here it is:
// Thin wrapper around a single OpenCL memory object (cl_mem) and the queue used to
// read from and write to it.
// NOTE(review): access specifiers, method declarations and the closing brace are not
// visible in this listing.
class OpenCLBuffer {
// Returns the underlying cl_mem handle itself, converted to void*.
void* GetNativeID() { return obj; }
// Handle returned by clCreateBuffer.
cl_mem obj;
// Queue on which the read/write commands of this buffer are enqueued.
cl_command_queue commandQueue;
// Context the buffer was created in.
cl_context context;
// Status code of the most recent OpenCL call made by this object.
cl_int ret;
// Flags value that was passed to clCreateBuffer.
int type;
// Size in bytes requested when the buffer was created.
int maxSize;
// Size in bytes of the most recent SetData call; GetData falls back to it when size == -1.
int currSize;
// Creates an OpenCL buffer of `size` bytes in context `cont`. `t` is forwarded to
// clCreateBuffer as the flags argument, no host pointer is supplied (NULL) and the
// creation status is stored in `ret`. `queue` is remembered for later transfers.
OpenCLBuffer::OpenCLBuffer(cl_context cont, cl_command_queue queue, cl_int t, unsigned int size)
context = cont;
commandQueue = queue;
maxSize = size;
type = t;
obj = clCreateBuffer(context, t, size, NULL, &ret);
// NOTE(review): the braces and the destructor header are not visible in this listing;
// the release below presumably belongs to ~OpenCLBuffer() rather than to the
// constructor — confirm against the real file.
ret = clReleaseMemObject(obj);
void OpenCLBuffer::SetData(int size, void* data, int offset)
currSize = size;
ret = clEnqueueWriteBuffer(commandQueue, obj, CL_TRUE, offset, size, data, 0, NULL, NULL);
void OpenCLBuffer::GetData(void* data, int size)
if (size == -1)
size = currSize;
ret = clEnqueueReadBuffer(commandQueue, obj, CL_TRUE, 0, size, data, 0, NULL, NULL);
// Owns one OpenCL platform/device/context/command-queue set plus the programs and
// kernels created through it.
// NOTE(review): the `sources`, `programs` and `kernels` members used by the method
// implementations, the access specifiers and the closing braces are not visible here.
class OpenCLContext {
// I removed the function definitions from the question as they are already in the Impl part
// Platform selected by clGetPlatformIDs.
cl_platform_id plarformId;
// Device selected by clGetDeviceIDs.
cl_device_id deviceId;
// Context created for deviceId.
cl_context context;
// Device / platform counts reported by the query calls.
cl_uint numDevices;
cl_uint numPlatforms;
// Queue used for every enqueue operation of this context.
cl_command_queue commandQueue;
// Status code of the most recent OpenCL call.
cl_int ret;
// Fixed-size buffer that receives the context name; also passed to clCreateContext as user_data.
char name[1024];
// Error callback handed to clCreateContext.
// user_data is streamed as a void*, so the pointer value is printed rather than
// the text it points to.
static void _stdcall OpenCLErrorFunc(const char* errinfo, const void* private_info, size_t cb, void* user_data){
std::cout << "OpenCL (" << user_data << ") Error : \n" << errinfo << "\n";
// Picks the first platform and its default device, then creates the context and
// the command queue. The status of the last call is left in `ret`.
//
// n - human readable name of the context; truncated to fit the `name` buffer.
OpenCLContext::OpenCLContext(std::string n)
{
    // Fill `name` before anything else: it is registered as user_data of the
    // error callback, so it has to be valid (and NUL-terminated) by the time
    // clCreateContext sees it. One byte is reserved for the terminator.
    const size_t copied = std::min<size_t>(sizeof(name) - 1, n.size());
    memcpy_s(name, sizeof(name), n.data(), copied);
    name[copied] = '\0';

    ret = clGetPlatformIDs(1, &plarformId, &numPlatforms);
    ret = clGetDeviceIDs(plarformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceId, &numDevices);
    context = clCreateContext(NULL, 1, &deviceId, OpenCLErrorFunc, name, &ret);
    commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret);
}
for (std::pair<std::string, char*> data : sources) {
if (data.second)
delete data.second;
ret = clFlush(commandQueue);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
// Creates a buffer of `size` bytes with the given flags and uploads `size` bytes
// from `data` into it. The caller receives the heap-allocated wrapper.
OpenCLBuffer* OpenCLContext::CreateBuffer(void* data, int size, int type)
{
    auto* created = new OpenCLBuffer(context, commandQueue, type, size);
    created->SetData(size, data);
    return created;
}
// Creates an uninitialised buffer of `size` bytes with the given flags and
// returns the heap-allocated wrapper.
OpenCLBuffer* OpenCLContext::CreateBuffer(int size, int type)
{
    return new OpenCLBuffer(context, commandQueue, type, size);
}
void OpenCLContext::AddProgram(std::string name, std::string source)
char* sc = new char[source.size()];
memcpy_s(sc, source.size(), source.data(), source.size());
sources[name] = sc;
int sourceSize = source.size();
programs[name] = clCreateProgramWithSource(context, 1, (const char**)&sc, (const size_t*)&sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
void OpenCLContext::MakeKernel(std::string programName, std::string kernelName)
kernels[kernelName] = clCreateKernel(programs[programName], kernelName.c_str(), &ret);
void OpenCLContext::SetKernelArg(std::string kernelName, int num, int size, void* arg)
ret = clSetKernelArg(kernels[kernelName], num, size, arg);
void OpenCLContext::ReleaseKernerl(std::string kernelName)
ret = clFlush(commandQueue);
ret = clReleaseKernel(kernels[kernelName]);
void OpenCLContext::ReleaseProgram(std::string programName)
ret = clFlush(commandQueue);
ret = clReleaseProgram(programs[programName]);
void OpenCLContext::Dispatch(std::string kernelName, int globalItemSize, int localItemSize)
ret = clEnqueueNDRangeKernel(commandQueue, kernels[kernelName], 1, NULL, (const size_t*)&globalItemSize, (const size_t*)&localItemSize, 0, NULL, NULL);
Driver Code:
std::string shadersrc = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
const int LIST_SIZE = 1024;
int* A = (int*)malloc(sizeof(int) * LIST_SIZE);
int* B = (int*)malloc(sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
context = new OpenCLContext("Vector Adder");
a = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
b = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
c = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_WRITE_ONLY);
a->SetData(LIST_SIZE * sizeof(int), A);
b->SetData(LIST_SIZE * sizeof(int), B);
context->AddProgram("VectorAdderSrc", shadersrc);
context->MakeKernel("VectorAdderSrc", "vector_add");
context->SetKernelArg("vector_add", 0, sizeof(cl_mem), a->GetNativeID());
context->SetKernelArg("vector_add", 1, sizeof(cl_mem), b->GetNativeID());
context->SetKernelArg("vector_add", 2, sizeof(cl_mem), c->GetNativeID());
context->Dispatch("vector_add", LIST_SIZE, 64);
int* C = (int*)malloc(sizeof(int) * LIST_SIZE);
memset(C, 0, sizeof(int) * LIST_SIZE);
c->GetData(c, sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
Sometimes I get a read access violation, and sometimes the output is:
0 + 1024 = 0
1 + 1023 = 0
2 + 1022 = 0
3 + 1021 = 0
Then crash.
Could you please help me find the problems?
First, a few general tips to make debugging easier:
Always check return values. Every OpenCL API reports errors via its return value or a reference parameter.
At the first occurrence of an error you should stop the rest of the program, as it will most likely not work. Throwing an exception is a good strategy.
Specifically for OpenCL, the error codes are defined in the main header cl.h; you can find a code-to-string mapping routine here: Convenient way to show OpenCL error codes?
Regarding your code, the first error came from your function AddProgram. The function clCreateProgramWithSource returns CL_OUT_OF_HOST_MEMORY. Your decision to cast sourceSize from int* to size_t* is problematic since they are not the same size, and the API reads a corrupted 64-bit value.
Here is a better implementation:
void AddProgram(std::string name, std::string source)
const char* sc = source.c_str();
size_t sourceSize[1] = {source.size()};
programs[name] = clCreateProgramWithSource(context, 1, &sc, sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
There is no need to keep the source code in memory after it has been compiled, but if you really want to, I suggest storing std::string objects, because they save you the hassle of managing memory.
The next problem is that clEnqueueNDRangeKernel returns CL_INVALID_WORK_GROUP_SIZE. Here you can find the int* to size_t* cast problem again, which passes bad arguments to the function.
Then, your call to SetKernelArg returns CL_INVALID_MEM_OBJECT. This is because the last argument, in the case of OpenCL buffers, is expected to be a pointer to a cl_mem object (in your case, the address of the cl_mem that is returned by GetNativeID()).
Finally, there is a typo in the line c->GetData(c, sizeof(int) * LIST_SIZE); it should be c->GetData(C, sizeof(int) * LIST_SIZE);
That should make it work. Please pay attention to the tips above, and avoid C casts in favor of C++ casts.
I've just finished refactoring my program to use the cublasLt library for GEMM, and I ran into a CUBLAS_STATUS_INVALID_VALUE when executing cublasLtMatmulAlgoGetHeuristic in the function below.
* Performs the matrix-matrix multiplication C = A x B
* #see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
* #param A - The left matrix A
* #param B - The right matrix B
* #param C - The result matrix C
* #param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* #param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* #param lightHandle - cublasLt handle
template<typename precision>
void CudaMatrix<precision>::product(const CudaMatrix &A,
const CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
std::cout << "returnedAlgoCount = " << returnedAlgoCount << std::endl;
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
I concatenated a minimal reproducible example below with the same source code as I have in my program (with trims).
This error may be related to a bug I found in NVIDIA forum but I am not sure.
I'm running on Ubuntu 18.04 with RTX 5000 GPU.
#include <iostream>
#include <iomanip>
#include <limits>
#include <vector>
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cublasLt.h>
// ****************************************************************************************************************** //
// ErrorsCheck.cuh //
// ****************************************************************************************************************** //
// Converts a cuBLAS status code into a printable string; "<unknown>" is the fallback.
// NOTE(review): the braces and the case labels of the switch are not visible in
// this listing, only the fallback return is.
static const char* cublasGetErrorEnum(cublasStatus_t error)
switch (error)
return "<unknown>";
// Reports a failed cuBLASLt call on stderr together with the call site.
// Does nothing when `status` is CUBLAS_STATUS_SUCCESS.
inline void cublasLtCheck(cublasStatus_t status, int iLine, const char *szFile) {
    if (status == CUBLAS_STATUS_SUCCESS) {
        return;
    }
    std::cerr << "CublasLt error " << cublasGetErrorEnum(status) << " at line " << iLine << " in file "
              << szFile << std::endl;
}
// Reports a failed CUDA runtime call on stderr together with the call site.
// Does nothing when `status` is cudaSuccess.
inline void cudaCheck(cudaError_t status, int iLine, const char *szFile) {
    if (status != cudaSuccess) {
        // This path handles CUDA runtime errors, so label them as such instead of
        // reusing the "CublasLt error" prefix of the cuBLASLt checker.
        std::cerr << "CUDA error " << cudaGetErrorString(status) << " at line " << iLine << " in file "
                  << szFile << std::endl;
    }
}
#define cublasLtCk(call) cublasLtCheck(call, __LINE__, __FILE__)
#define cudaCk(call) cudaCheck(call, __LINE__, __FILE__)
// ****************************************************************************************************************** //
// CudaMatrix.cuh //
// ****************************************************************************************************************** //
#define MB 1048576 // 2^20 bytes (1 MiB)
// Shorthand for unsigned int used by the matrix code below.
typedef unsigned int uint;
template <typename precision>
struct CudaMatrix {
// Matrix multiplication GPU workspace that can be used to improve matrix multiplication computation time
const static void *matMulWorkspace;
const static size_t matMulWorkspaceSize;
CudaMatrix() : width(0), height(0), data(nullptr), cublasHandle(nullptr), cublasLtHandle(nullptr), matrixLayout(nullptr) { };
CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr, cublasLtHandle_t cublasLtHandle = nullptr,
cublasLtMatrixLayout_t matrixLayout = nullptr) : width(width), height(height), cublasHandle(cublasHandle),
cublasLtHandle(cublasLtHandle), matrixLayout(matrixLayout)
cudaCk(cudaMalloc(&data, bytesSize()));
if (typeid(precision).hash_code() == typeid(uint).hash_code()) {
cublasLtDataType = CUDA_R_8U;
} else if (typeid(precision).hash_code() == typeid(int).hash_code()) {
cublasLtDataType = CUDA_R_8I;
} else if (typeid(precision).hash_code() == typeid(float).hash_code()) {
cublasLtDataType = CUDA_R_32F;
} else if (typeid(precision).hash_code() == typeid(double).hash_code()) {
cublasLtDataType = CUDA_R_64F;
} else {
throw std::runtime_error("The datatype " + std::string(typeid(precision).name()) + " is not handled in CudaMatrix");
cublasLtCk(cublasLtMatrixLayoutCreate(&matrixLayout, cublasLtDataType, height, width, width));
if (matMulWorkspace == nullptr) {
cudaCk(cudaMalloc(&matMulWorkspace, matMulWorkspaceSize));
__device__ __host__ uint size() const { return width * height; }
static void product(const CudaMatrix &A, const CudaMatrix &B, CudaMatrix &C, cublasOperation_t opA, cublasOperation_t opB, cublasLtHandle_t lightHandle);
void freeResources() { cudaCk(cudaFree(data)); cublasLtCk(cublasLtMatrixLayoutDestroy(matrixLayout)); }
uint bytesSize() const { return size() * sizeof(precision); }
void setValuesFromVector(const std::vector<precision> &vector);
void setValuesFromVector(const std::vector<std::vector<precision>> &vectors);
void display(const std::string &name = "", uint x = 0, uint y = 0, uint roiWidth = 0, uint roiHeight = 0) const;
void product(const CudaMatrix &A) { product(*this, A, *this, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle); }
precision *data;
uint width,
cublasHandle_t cublasHandle;
cublasLtHandle_t cublasLtHandle;
cublasLtMatrixLayout_t matrixLayout;
cudaDataType_t cublasLtDataType;
template <typename precision> const size_t CudaMatrix<precision>::matMulWorkspaceSize = 500 * MB;
template <typename precision> const void* CudaMatrix<precision>::matMulWorkspace = nullptr;
// ****************************************************************************************************************** //
// CudaMatrix.cu //
// ****************************************************************************************************************** //
* Display the matrix
* #tparam precision - The matrix precision
* #param name - The matrix name
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name, uint x, uint y, uint roiWidth, uint roiHeight) const
precision *hostValues;
roiWidth == 0 ? roiWidth = width : roiWidth = roiWidth;
roiHeight == 0 ? roiHeight = height : roiHeight = roiHeight;
cudaCk(cudaMallocHost(&hostValues, bytesSize()));
cudaCk(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));
std::cout << std::setprecision(std::numeric_limits<precision>::digits10 + 1);
std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of "
<< abi::__cxa_demangle(typeid(precision).name(), nullptr, nullptr, nullptr)
<< "\n\n";
for (int i = y; i < y + roiHeight; ++i) {
std::cout << "{ ";
for (int j = x; j < x + roiWidth - 1; ++j) {
std::cout << *(hostValues + i * width + j) << ", ";
std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
std::cout << std::endl;
* Set the matrix values in device CUDA memory from a host standard 1D vector
* #tparam precision - The matrix precision
* #param vector - The values to set the device CUDA memory from
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
cudaCk(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
* Set the matrix values in device CUDA memory from a host standard 2D vector
* #tparam precision - The matrix precision
* #param vectors - The values to set the device CUDA memory from
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<std::vector<precision>> &vectors)
std::vector<precision> buffer;
buffer.reserve(vectors.size() * vectors[0].size());
for (const auto &vector : vectors) {
buffer.insert(buffer.end(), vector.begin(), vector.end());
* Performs the matrix-matrix multiplication C = A x B
* #see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
* #param A - The left matrix A
* #param B - The right matrix B
* #param C - The result matrix C
* #param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* #param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* #param lightHandle - cublasLt handle
template<typename precision>
void CudaMatrix<precision>::product(const CudaMatrix &A,
const CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
std::cout << "returnedAlgoCount = " << returnedAlgoCount << std::endl;
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
// Explicit template instantiations of CudaMatrix for the element types its
// constructor handles (these are instantiations, not forward declarations).
template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;
template struct CudaMatrix<uint>;
// ****************************************************************************************************************** //
// main.cu //
// ****************************************************************************************************************** //
int main(int argc, char const *argv[])
cublasLtHandle_t cublasLtHandle = nullptr;
std::vector<float> r1Expect = { 6, 6, 6, 15, 15, 15, 24, 24, 24 };
std::vector<float> r2Expect = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Declare matrices
CudaMatrix<float> m1(3, 3);
CudaMatrix<float> m2(3, 3);
CudaMatrix<float> m3(3, 3);
CudaMatrix<float> deviceResult(3, 3);
// Set device memory values
m1.setValuesFromVector({ {1, 1, 1}, {1, 1, 1}, {1, 1, 1} });
m2.setValuesFromVector({ {1, 2, 3}, {4, 5, 6}, {7, 8, 9} });
m3.setValuesFromVector({ {1, 0, 0}, {0, 1, 0}, {0, 0, 1} });
// Test results (just showing it here)
CudaMatrix<float>::product(m1, m2, deviceResult, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult.display("m1 X m2");
CudaMatrix<float>::product(m2, m3, deviceResult, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult.display("m2 X m3");
// Clean up
return 0;
# Build script for the single-file cublasLt matrix multiplication test (matmulTest).
cmake_minimum_required(VERSION 3.10)
# ------------------------------------------------ Compilation options ----------------------------------------------- #
# CUDA 10 does not support C++ 17
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
set(CMAKE_BUILD_TYPE Debug) # Release or Debug
# Include CUDA
find_package(CUDA REQUIRED)
# Flags passed to nvcc: sm_75 target, C++14, relaxed constexpr and extended lambdas
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_75 -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda")
# ----------------------------------------------------- Constants ---------------------------------------------------- #
# NOTE(review): the if() that opens this else()/endif() pair is not visible in this
# listing; presumably it tests CMAKE_BUILD_TYPE — confirm against the real file.
MESSAGE(STATUS "Debug build")
else ()
MESSAGE(STATUS "Release build")
endif ()
# ------------------------------------------------- Source code files ------------------------------------------------ #
# All in one
file(GLOB matmul "cublaslt_mat_mul.cu")
# ---------------------------------------------------- Executables --------------------------------------------------- #
cuda_add_executable(matmulTest ${matmul})
# ---------------------------------------------------- Libraries ----------------------------------------------------- #
# Path to local libraries
# (absolute paths: the glob only matches libraries installed at exactly these locations)
file(GLOB CUDAlibs "/usr/lib/x86_64-linux-gnu/libcuda.so" "/usr/lib/x86_64-linux-gnu/libcublas.so" "/usr/lib/x86_64-linux-gnu/libcublasLt.so" "/usr/local/cuda/lib64/libcudart.so")
# Link libraries
target_link_libraries(matmulTest ${CUDAlibs})
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 249 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
returnedAlgoCount = -768202864
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 256 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
Matrix m1 3 x 3 pixels of float
{ 1, 1, 1 }
{ 1, 1, 1 }
{ 1, 1, 1 }
Matrix m2 3 x 3 pixels of float
{ 1, 2, 3 }
{ 4, 5, 6 }
{ 7, 8, 9 }
Matrix m1 X m2 3 x 3 pixels of float
{ 0, 0, 0 }
{ 0, 0, 0 }
{ 0, 0, 0 }
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 249 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
returnedAlgoCount = -870514560
CublasLt error CUBLAS_STATUS_INVALID_VALUE at line 256 in file /home/rom1/Desktop/test_cuda/cublaslt_mat_mul.cu
Matrix m2 3 x 3 pixels of float
{ 1, 1, 1 }
{ 1, 1, 1 }
{ 1, 1, 1 }
Matrix m3 3 x 3 pixels of float
{ 1, 0, 0 }
{ 0, 1, 0 }
{ 0, 0, 1 }
Matrix m2 X m3 3 x 3 pixels of float
{ 0, 0, 0 }
{ 0, 0, 0 }
{ 0, 0, 0 }
I made two mistakes:
The matrixLayout was not set properly. I wrote a function that sets it before each multiplication, based on the op applied to the matrix.
Additionally, I had stored the matrix memory in row-major order instead of column-major order.
Now the code works well for square and non-square products with row-major memory.
#include <iostream>
#include <iomanip>
#include <limits>
#include <vector>
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cublasLt.h>
// ****************************************************************************************************************** //
// ErrorsCheck.cuh //
// ****************************************************************************************************************** //
// Converts a cuBLAS status code into a printable string; "<unknown>" is the fallback.
// NOTE(review): the braces and the case labels of the switch are not visible in
// this listing, only the fallback return is.
static const char* cublasGetErrorEnum(cublasStatus_t error)
switch (error)
return "<unknown>";
// Reports a failed cuBLASLt call on stderr together with the call site.
// Does nothing when `status` is CUBLAS_STATUS_SUCCESS.
inline void cublasLtCheck(cublasStatus_t status, int iLine, const char *szFile) {
    if (status == CUBLAS_STATUS_SUCCESS) {
        return;
    }
    std::cerr << "CublasLt error " << cublasGetErrorEnum(status) << " at line " << iLine << " in file "
              << szFile << std::endl;
}
// Reports a failed CUDA runtime call on stderr together with the call site.
// Does nothing when `status` is cudaSuccess.
inline void cudaCheck(cudaError_t status, int iLine, const char *szFile) {
    if (status != cudaSuccess) {
        // This path handles CUDA runtime errors, so label them as such instead of
        // reusing the "CublasLt error" prefix of the cuBLASLt checker.
        std::cerr << "CUDA error " << cudaGetErrorString(status) << " at line " << iLine << " in file "
                  << szFile << std::endl;
    }
}
#define cublasLtCk(call) cublasLtCheck(call, __LINE__, __FILE__)
#define cudaCk(call) cudaCheck(call, __LINE__, __FILE__)
// ****************************************************************************************************************** //
// CudaMatrix.cuh //
// ****************************************************************************************************************** //
#define MB 1048576 // 2^20 bytes (1 MiB)
// Shorthand for unsigned int used by the matrix code below.
typedef unsigned int uint;
template <typename precision>
struct CudaMatrix {
// Matrix multiplication GPU workspace that can be used to improve matrix multiplication computation time
const static void *matMulWorkspace;
const static size_t matMulWorkspaceSize;
CudaMatrix() : width(0), height(0), data(nullptr), cublasHandle(nullptr), cublasLtHandle(nullptr), matrixLayout(nullptr) { };
CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr, cublasLtHandle_t cublasLtHandle = nullptr,
cublasLtMatrixLayout_t matrixLayout = nullptr) : width(width), height(height), cublasHandle(cublasHandle),
cublasLtHandle(cublasLtHandle), matrixLayout(matrixLayout)
cudaCk(cudaMalloc(&data, bytesSize()));
if (typeid(precision).hash_code() == typeid(uint).hash_code()) {
cublasLtDataType = CUDA_R_8U;
} else if (typeid(precision).hash_code() == typeid(int).hash_code()) {
cublasLtDataType = CUDA_R_8I;
} else if (typeid(precision).hash_code() == typeid(float).hash_code()) {
cublasLtDataType = CUDA_R_32F;
} else if (typeid(precision).hash_code() == typeid(double).hash_code()) {
cublasLtDataType = CUDA_R_64F;
} else {
throw std::runtime_error("The datatype " + std::string(typeid(precision).name()) + " is not handled in CudaMatrix");
if (matMulWorkspace == nullptr) {
cudaCk(cudaMalloc(&matMulWorkspace, matMulWorkspaceSize));
__device__ __host__ uint size() const { return width * height; }
static void product(CudaMatrix &A, CudaMatrix &B, CudaMatrix &C, cublasOperation_t opA, cublasOperation_t opB, cublasLtHandle_t lightHandle);
void freeResources() { cudaCk(cudaFree(data)); cublasLtCk(cublasLtMatrixLayoutDestroy(matrixLayout)); }
void setMatrixLayout(cublasOperation_t op, cublasLtOrder_t matrixOrder = CUBLASLT_ORDER_ROW);
uint bytesSize() const { return size() * sizeof(precision); }
void setValuesFromVector(const std::vector<precision> &vector);
void setValuesFromVector(const std::vector<std::vector<precision>> &vectors);
void display(const std::string &name = "", uint x = 0, uint y = 0, uint roiWidth = 0, uint roiHeight = 0) const;
void product(CudaMatrix &A) { product(*this, A, *this, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle); }
precision *data;
uint width,
cublasHandle_t cublasHandle;
cublasLtHandle_t cublasLtHandle;
cublasLtMatrixLayout_t matrixLayout;
cudaDataType_t cublasLtDataType;
template <typename precision> const size_t CudaMatrix<precision>::matMulWorkspaceSize = 500 * MB;
template <typename precision> const void* CudaMatrix<precision>::matMulWorkspace = nullptr;
// ****************************************************************************************************************** //
// CudaMatrix.cu //
// ****************************************************************************************************************** //
* Display the matrix
* #tparam precision - The matrix precision
* #param name - The matrix name
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name, uint x, uint y, uint roiWidth, uint roiHeight) const
precision *hostValues;
roiWidth == 0 ? roiWidth = width : roiWidth = roiWidth;
roiHeight == 0 ? roiHeight = height : roiHeight = roiHeight;
cudaCk(cudaMallocHost(&hostValues, bytesSize()));
cudaCk(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));
std::cout << std::setprecision(std::numeric_limits<precision>::digits10 + 1);
std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of "
<< abi::__cxa_demangle(typeid(precision).name(), nullptr, nullptr, nullptr)
<< "\n\n";
for (int i = y; i < y + roiHeight; ++i) {
std::cout << "{ ";
for (int j = x; j < x + roiWidth - 1; ++j) {
std::cout << *(hostValues + i * width + j) << ", ";
std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
std::cout << std::endl;
* Set the matrix values in device CUDA memory from a host standard 1D vector
* #tparam precision - The matrix precision
* #param vector - The values to set the device CUDA memory from
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
cudaCk(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
* Set the matrix values in device CUDA memory from a host standard 2D vector
* #tparam precision - The matrix precision
* #param vectors - The values to set the device CUDA memory from
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<std::vector<precision>> &vectors)
std::vector<precision> buffer;
buffer.reserve(vectors.size() * vectors[0].size());
for (const auto &vector : vectors) {
buffer.insert(buffer.end(), vector.begin(), vector.end());
* Set the matrix layout before matrix multiplication with row major memory by default
* #tparam precision - The matrix precision
* #param op - Operation to perform on matrix before multiplication (none, transpose or hermitian)
* #param matrixOrder - The matrix memory order (column or row DEFAULT row)
template<typename precision>
void CudaMatrix<precision>:: setMatrixLayout(cublasOperation_t op, cublasLtOrder_t matrixOrder)
const uint m = (op == CUBLAS_OP_N ? height : width),
n = (op == CUBLAS_OP_N ? width : height);
cublasLtCk(cublasLtMatrixLayoutCreate(&matrixLayout, cublasLtDataType, m, n, height));
cublasLtCk(cublasLtMatrixLayoutSetAttribute(matrixLayout, CUBLASLT_MATRIX_LAYOUT_ORDER, &matrixOrder, sizeof(matrixOrder)));
* Performs the matrix-matrix multiplication C = A x B
* #see https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul
* #param A - The left matrix A
* #param B - The right matrix B
* #param C - The result matrix C
* #param opA - Operation to perform on matrix A before multiplication (none, transpose or hermitian)
* #param opB - Operation to perform on matrix B before multiplication (none, transpose or hermitian)
* #param lightHandle - cublasLt handle
template<typename precision>
void CudaMatrix<precision>::product(CudaMatrix &A,
CudaMatrix &B,
CudaMatrix &C,
cublasOperation_t opA,
cublasOperation_t opB,
cublasLtHandle_t lightHandle
) {
const precision zero = 0,
one = 1;
const int requestedAlgoCount = 1;
cudaStream_t stream = nullptr;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
cublasLtMatmulDesc_t computeDesc;
int returnedAlgoCount;
// Set matrix pre-operation such as transpose if any
cublasLtCk(cublasLtMatmulDescCreate(&computeDesc, A.cublasLtDataType));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
cublasLtCk(cublasLtMatmulDescSetAttribute(computeDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opB, sizeof(opB)));
// Set matrices layout
// Get the best algorithm to use
cublasLtCk(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&CudaMatrix::matMulWorkspaceSize, sizeof(CudaMatrix::matMulWorkspaceSize)));
cublasLtCk(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, A.matrixLayout, B.matrixLayout,
C.matrixLayout, C.matrixLayout, preference, requestedAlgoCount, &heuristicResult, &returnedAlgoCount));
// Do the multiplication
cublasLtCk(cublasLtMatmul(lightHandle, computeDesc, &one, A.data, A.matrixLayout, B.data, B.matrixLayout, &zero,
C.data, C.matrixLayout, C.data, C.matrixLayout, &heuristicResult.algo,
&CudaMatrix::matMulWorkspace, CudaMatrix::matMulWorkspaceSize, stream));
// clean up
// Explicit template instantiations of CudaMatrix for the element types its
// constructor handles (these are instantiations, not forward declarations).
template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;
template struct CudaMatrix<uint>;
// ****************************************************************************************************************** //
// main.cu //
// ****************************************************************************************************************** //
int main(int argc, char const *argv[])
cublasLtHandle_t cublasLtHandle = nullptr;
std::vector<float> r1Expect = { 6, 6, 6, 15, 15, 15, 24, 24, 24 };
std::vector<float> r2Expect = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Declare matrices
CudaMatrix<float> m1(3, 3);
CudaMatrix<float> m2(3, 3);
CudaMatrix<float> m3(3, 3);
CudaMatrix<float> m4(3, 2);
CudaMatrix<float> m5(2, 3);
CudaMatrix<float> deviceResult_2_2(2, 2);
CudaMatrix<float> deviceResult_3_3(3, 3);
// Set device memory values
m1.setValuesFromVector({ {1, 1, 1}, {1, 1, 1}, {1, 1, 1} });
m2.setValuesFromVector({ {1, 2, 3}, {4, 5, 6}, {7, 8, 9} });
m3.setValuesFromVector({ {1, 0, 0}, {0, 1, 0}, {0, 0, 1} });
m4.setValuesFromVector({ {1, 2, 3}, {4, 5, 6} });
m5.setValuesFromVector({ {1, 2}, { 3, 4 }, { 5 , 6 } });
// Test results (just showing it here)
CudaMatrix<float>::product(m1, m2, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m1 X m2");
CudaMatrix<float>::product(m2, m3, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m2 X m3");
CudaMatrix<float>::product(m4, m5, deviceResult_3_3, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_3_3.display("m4 X m5");
CudaMatrix<float>::product(m5, m4, deviceResult_2_2, CUBLAS_OP_N, CUBLAS_OP_N, cublasLtHandle);
deviceResult_2_2.display("m5 X m4");
// Clean up
return 0;
It is a simple program that reads two float4 vectors from files and then calculates the sum of opposite numbers.
I couldn't find the problem:
MAIN file:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#include <CL/cl.h>
#include <time.h>
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
// Reports a failed OpenCL call on stderr.
// err:       status code returned by the OpenCL call
// operation: short description of the call, included in the message
// NOTE(review): no exit/abort is visible after the message — confirm whether execution
// is meant to continue after an error.
void checkError(cl_int err, const char *operation)
if (err != CL_SUCCESS)
fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
// Host program: reads two point files into arrays, packs them into flat float arrays and
// runs the calculate_bottom_SNM OpenCL kernel on them.
int main(int argc, char *argv[]) {
clock_t tStart = clock(); // start timestamp for the total-time printout at the end
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // current row index while filling tempAArray / tempBArray
int col = 0; // current column index while filling tempAArray / tempBArray
// working arrays
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
// NOTE(review): despite the names, these three values are float counts (points * axis),
// not byte sizes; later they are multiplied by sizeof(cl_float4) when sizing device buffers.
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
// Host staging arrays: each allocation holds number_of_points cl_float4-sized slots.
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array receives the kernel output
// import input files
// NOTE(review): no open() call for input_fileA / input_fileB is visible in this section.
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
istringstream streamA(line);
col = 0;
while(streamA >> x){
// NOTE(review): no increment of col or row is visible here — confirm both advance,
// otherwise every value is stored at [0][0].
tempAArray[row][col] = x;
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
istringstream streamB(line);
col = 0;
while(streamB >> x){
// NOTE(review): same as for file A — no increment of col or row is visible.
tempBArray[row][col] = x;
// switch columns of B array
// Swaps columns 1 and 2 of every row of tempBArray.
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
// Copy the per-point values into the flat float arrays handed to OpenCL.
// NOTE(review): the stride used below is number_of_points (16), but each array was allocated
// with number_of_points*sizeof(cl_float4) bytes, i.e. 4 floats per point; indices such as
// 15*16+3 lie outside that allocation. A stride of number_of_axis appears to be intended — confirm.
// NOTE(review): the fourth component of A is forced to 0.0f while B keeps tempBArray[..][3].
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
// NOTE(review): no close() call for the input streams is visible in this section.
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
// NOTE(review): no exit/return is visible after the message above, so the calls below
// would run with a null fp — confirm.
// Seek to the end so ftell() reports the file length.
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
// One extra byte for the terminating '\0' written below.
source_str = (char*)malloc(programLength+1);
// NOTE(review): no rewind()/fseek(..., SEEK_SET) is visible before this read; reading from
// the end-of-file position would return 0 bytes — confirm.
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
// NOTE(review): ret is overwritten by most calls in this section without being checked;
// only the two build-log queries go through checkError.
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
// NOTE(review): mem_size_* is points*axis (64), so each buffer is 64*sizeof(cl_float4) bytes —
// four times the number_of_points*sizeof(cl_float4) bytes allocated for the host arrays.
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
// NOTE(review): these blocking writes copy 64*sizeof(cl_float4) bytes from host arrays that were
// allocated with only number_of_points*sizeof(cl_float4) bytes — the copy reads past the allocation.
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
// source_size is the byte count returned by fread when the kernel file was loaded.
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
// NOTE(review): an array with a runtime bound is a compiler extension in C++, not standard.
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
// Argument order: input A buffer, input B buffer, output buffer.
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 4
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the output buffer on the device back into outputArray (blocking read)
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
// NOTE(review): the size argument here is mem_size_Output (64) with no sizeof factor, unlike the
// writes above, so only 64 bytes are read back; the return code is also not stored — confirm.
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
// Drain the queue, then release the OpenCL objects and the host staging arrays.
// NOTE(review): no free(source_str) is visible among the releases below.
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
// Adds inputAArray and inputBArray element-wise into outputArray; each work-item writes
// four float4 elements starting at index i*number_of_points.
// NOTE(review): the pointers are float4, so i*number_of_points+k selects whole float4 elements
// at a stride of 16 per work-item, not the four components of point i.
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
__global float4 *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int number_of_points = 16;
outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];
The first input files: A.txt
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
The second input file B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
Thanks in advance
You are computing your array indices in your kernel in a fairly strange manner:
Think about what this actually translates to for different values of i (assuming number_of_points=16):
i array indices (i*16 + (0,1,2,3))
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
This is surely not what you wanted! Your sample code appears to just be trying to perform a vectorised vector addition. If that's the case, your kernel code just needs to look something like this:
// Vector addition on float4 data: work-item i adds the i-th float4 of inputA and inputB
// (all four components at once) and stores the sum in output[i].
__kernel void vecadd(__global float4 *inputA,
__global float4 *inputB,
__global float4 *output)
int i = get_global_id(0);
output[i] = inputA[i] + inputB[i];
This works because we are performing the same operation on each element of the vector. If you have a kernel that needs to use these elements separately, you would write code like this:
// Load both float4 operands once, then treat each component (.x/.y/.z/.w) separately.
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component
I am having a hard time following the OpenCL example for Random123, as I'm new to OpenCL and am not sure how to use the provided information with Visual Studio 2010.
Can anyone write a guide for generating random numbers with the above library using Visual Studio 2010?
I solved it as follows and am now wondering how to change the seed so that I get different random numbers on each run.
// Host driver for the counthits kernel: builds pi_opencl_kernel.ocl, launches one work-item per
// "thread", reads back a (hits, tries) pair per work-item, sums them and returns pi_check(hits, tries).
int main(int argc, char **argv)
const char *kernelname = "counthits";
unsigned count =10000;
cl_int err;
cl::Context cl_context;
cl::Program program;
cl::Kernel cl_kernel;
cl::Buffer cl_out;
cl::CommandQueue cl_queue;
size_t i, nthreads, hits_sz;
size_t cores, work_group_size;
cl_uint2 * hits_host;
double d = 0.; // timer
d = timer(&d);
// NOTE(review): progname is not declared in the visible code — presumably a global; verify.
progname = argv[0];
// NOTE(review): platformList is not used again in the visible code.
std::vector< cl::Platform > platformList;
CHECKERR( cl_context = createCLContext(CL_DEVICE_TYPE_GPU,cl_vendor::VENDOR_AMD, &err) );
std::vector<cl::Device> devices;
CHECKERR( devices = cl_context.getInfo<CL_CONTEXT_DEVICES>(&err) );
size_t length = 0;
const char * sourceStr = loadFileToString("pi_opencl_kernel.ocl","",&length);
cl::Program::Sources sources(1, std::make_pair(sourceStr, length));
program = cl::Program(cl_context, sources);
// The -I option passes the Random123 include directory to the kernel build.
CHECK( program.build(devices,"-I D:\\libs\\Random123\\1.06\\include") );
CHECKERR(work_group_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&err) );
CHECKERR(cores = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&err) );
// Hard-coded multiplier; the original comment ties it to the Tahiti GPU.
cores *= 16*4; //Tahiti.
if (work_group_size > 64) work_group_size /= 2;
nthreads = cores * work_group_size*32; //2048*128 = 262144
// NOTE(review): count is fixed at 10000 above, so this branch is never taken in the visible code.
if (count == 0)
count = NTRIES/nthreads; //38
// NOTE(review): count is unsigned but is printed with %lu — format/argument mismatch.
printf("Count: %lu\n",count);
hits_sz = nthreads * sizeof(hits_host[0]);//2097152
// NOTE(review): no free(hits_host) is visible before the return.
CHECKNOTZERO(hits_host = (cl_uint2 *)malloc(hits_sz));
// The output buffer is created over the host allocation (CL_MEM_USE_HOST_PTR).
CHECKERR ( cl_out = cl::Buffer( cl_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hits_sz, hits_host, &err));
CHECKERR ( cl_kernel = cl::Kernel(program,kernelname,&err) );
// Kernel arguments: 0 = number of tries per work-item, 1 = output buffer.
CHECK ( cl_kernel.setArg( 0, count) );
CHECK ( cl_kernel.setArg( 1, cl_out) );
CHECKERR (cl_queue = cl::CommandQueue(cl_context, devices[0], 0, &err) );
cl::Event event;
CHECK( cl_queue.enqueueNDRangeKernel(cl_kernel,cl::NullRange,cl::NDRange(nthreads), cl::NDRange(work_group_size), NULL, &event) );
// Blocking read (CL_TRUE) of the per-work-item results into hits_host.
CHECK( cl_queue.enqueueReadBuffer(cl_out, CL_TRUE, 0,hits_sz, hits_host) );
unsigned long hits = 0, tries = 0;
// Accumulate hits (s[0]) and tries (s[1]) over all work-items.
for (i = 0; i < nthreads; i++) {
// NOTE(review): no matching #endif is visible for this #ifdef.
#ifdef _DEBUG
printf("%lu %u %u\n", (unsigned long)i, hits_host[i].s[0], hits_host[i].s[1]);
hits += hits_host[i].s[0];
tries += hits_host[i].s[1];
return pi_check(hits, tries);
#include <Random123/threefry.h>
* counthits generates 2*n x,y points and returns hits[tid] with
* the count of number of those points within the unit circle on
* each thread.
// Per work-item kernel: draws threefry4x32 outputs, tests two (x, y) pairs per draw against
// a circle, and stores the hit and try counts for this work-item in hitsp[tid].
// n:     loop bound on tries for each work-item
// hitsp: output array, one uint2 per work-item (.x = hits, .y = tries)
__kernel void counthits(unsigned n, __global uint2 *hitsp) {
unsigned tid = get_global_id(0);
unsigned hits = 0, tries = 0;
// The key embeds tid, so each work-item uses a different key; the other words are fixed constants.
threefry4x32_key_t k = {{tid, 0xdecafbad, 0xfacebead, 0x12345678}};
threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
// NOTE(review): no update of the counter c, hits or tries is visible inside this loop — confirm.
while (tries < n) {
// The union reinterprets the four 32-bit outputs as signed ints.
union {
threefry4x32_ctr_t c;
int4 i;
} u;
u.c = threefry4x32(c, k);
// Widen to long so the squares below do not overflow 32 bits.
long x1 = u.i.x, y1 = u.i.y;
long x2 = u.i.z, y2 = u.i.w;
// 1L<<62 is (2^31)^2: the squared distance is compared against a radius of 2^31.
if ((x1*x1 + y1*y1) < (1L<<62)) {
if ((x2*x2 + y2*y2) < (1L<<62)) {
hitsp[tid].x = hits;
hitsp[tid].y = tries;
I haven't tested this, but roughly speaking, something like the following:
Try changing the signature of counthits to:
// counthits signature extended with a caller-supplied seed as a third argument.
__kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed)
Replace 0xdecafbad with seed
// Take the seed from the COUNTHITS_SEED environment variable when it is set;
// otherwise fall back to the constant the kernel used originally.
char *seedstr = getenv("COUNTHITS_SEED");
unsigned seed = seedstr ? atoi(seedstr) : 0xdecafbad;
// Pass the seed as kernel argument 2.
CHECK ( cl_kernel.setArg( 2, seed) );
to the main program (this setArg comes after setArg( 1, ...), and you can, of).