Why does OpenCL fail for enqueueReadBuffer into a 2D vector?

I'm not sure if this is a bug or if I've just made a simple mistake, but it appears that reading an OpenCL buffer into a 2D vector causes some odd behaviour.
By that I mean it either segfaults or crashes with a message such as "corrupted double-linked list" or "free(): invalid size" after the queue.enqueueReadBuffer call. I can provide the backtrace / memory map if it would help.
Reading into a 1D vector works as expected.
I am running Linux 3.8.0-35 (x86_64) with AMD Catalyst 13.25.5 and I compiled this using: g++ -I/opt/AMDAPP/include main.cpp OpenCl.cpp -lOpenCL
Minimal working example:
main.cpp
// System headers
#include <iostream>
#include <vector>
// Third-party headers
#include <CL/cl.hpp>
// Project headers
#include "OpenCl.h"
int main(int argc, char* argv[])
{
    OpenCl opencl;

    const unsigned int num_rows = 241;
    const unsigned int num_cols = 886;
    const unsigned int num_elements = num_rows * num_cols;
    const size_t array_sz = num_elements * sizeof(cl_float);

    const std::vector<cl_float> A_1d(num_elements, 1.2345f);
    std::vector<cl_float> B_1d(num_elements, 0);
    const std::vector<std::vector<cl_float> > A_2d(num_rows, std::vector<cl_float>(num_cols, 1.2345f));
    std::vector<std::vector<cl_float> > B_2d(num_rows, std::vector<cl_float>(num_cols, 0));

    // Works as expected
    std::cout << "START 1D TEST\n";
    opencl.test1D(A_1d, B_1d, array_sz);
    std::cout << "1D TEST COMPLETE\n";

    // Crashes
    std::cout << "START 2D TEST\n";
    opencl.test2D(A_2d, B_2d, array_sz);
    std::cout << "2D TEST COMPLETE\n";

    return 0;
}
OpenCl.h
#pragma once
#define __CL_ENABLE_EXCEPTIONS
// Third-party headers
#include <CL/cl.hpp>
class OpenCl {
public:
    OpenCl();

    void test1D(const std::vector<cl_float> &A,
                std::vector<cl_float> &B,
                const size_t array_sz);

    void test2D(const std::vector<std::vector<cl_float> > &A,
                std::vector<std::vector<cl_float> > &B,
                const size_t array_sz);

private:
    cl::Context context;
    cl::CommandQueue queue;
};
OpenCl.cpp
// Class header
#include "OpenCl.h"
// System headers
#include <iostream>
#include <vector>
// Third-party headers
#include <CL/cl.hpp>
OpenCl::OpenCl()
{
    // Get available platforms
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);

    // Select the default platform and create a context using the GPU
    cl_context_properties cps[] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(platforms[0])(),
        0
    };
    context = cl::Context(CL_DEVICE_TYPE_GPU, cps);

    // Get a list of devices on this platform
    std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

    // Create a command queue and use the first device
    queue = cl::CommandQueue(context, devices[0]);
}

void OpenCl::test1D(const std::vector<cl_float> &A,
                    std::vector<cl_float> &B,
                    const size_t array_sz)
{
    try {
        // Initialize device buffer
        cl::Buffer A_d = cl::Buffer(context, CL_MEM_READ_ONLY, array_sz);

        // Transfer data to device
        queue.enqueueWriteBuffer(A_d, CL_TRUE, 0, array_sz, &A[0]);

        // Transfer data from device
        std::cout << "B[0]: " << B[0] << "\n";
        queue.enqueueReadBuffer(A_d, CL_TRUE, 0, array_sz, &B[0]);
        std::cout << "B[0]: " << B[0] << "\n";
    } catch(cl::Error &error) {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
        std::cout << "Program failed!\n";
    }
}

void OpenCl::test2D(const std::vector<std::vector<cl_float> > &A,
                    std::vector<std::vector<cl_float> > &B,
                    const size_t array_sz)
{
    try {
        // Initialize device buffer
        cl::Buffer A_d = cl::Buffer(context, CL_MEM_READ_ONLY, array_sz);

        // Transfer data to device
        queue.enqueueWriteBuffer(A_d, CL_TRUE, 0, array_sz, &A[0][0]);

        // Transfer data from device
        std::cout << "B[0][0]: " << B[0][0] << "\n";
        queue.enqueueReadBuffer(A_d, CL_TRUE, 0, array_sz, &B[0][0]);
        std::cout << "B[0][0]: " << B[0][0] << "\n";
    } catch(cl::Error &error) {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
        std::cout << "Program failed!\n";
    }
}

Memory held by std::vector<std::vector<float>> is not contiguous, so you cannot copy it in one operation. You would have to copy it row by row:
size_t row_size = A[0].size() * sizeof(A[0][0]);
for(size_t row = 0; row < A.size(); ++row)
    queue.enqueueWriteBuffer(A_d, CL_TRUE, /*offset=*/row * row_size, /*size=*/row_size, &A[row][0]);
But you would be better off in terms of performance if your data were laid out in a contiguous array anyway (as in your test1D).
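For the read direction the same idea applies. Here is a minimal sketch of a matching row-by-row read for test2D, assuming (as in the question) that A_d holds num_rows * num_cols floats and B is pre-sized to num_rows inner vectors of num_cols elements:
// Copy each contiguous row of the device buffer into the matching host row.
size_t row_size = B[0].size() * sizeof(B[0][0]);
for (size_t row = 0; row < B.size(); ++row)
    queue.enqueueReadBuffer(A_d, CL_TRUE, /*offset=*/row * row_size, /*size=*/row_size, &B[row][0]);
Each blocking call writes exactly one inner vector, so nothing runs past the end of a row allocation.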

Related

wcstombs_s is undefined (gcc, M1 Mac)

I'm trying to make my own programming language and a compiler for it. While creating the lexical analyzer I ran into trouble with this function, even though I included stdlib.h and stdio.h. The platform I'm using is an M1 Mac with VS Code and gcc.
Here is the code:
#define _CRT_SECURE_NO_WARNINGS
#include "Log.h"
namespace Log
{
    LOG getlog(wchar_t logfile[])
    {
        try
        {
            LOG struc;
            struc.stream = new (std::nothrow) std::ofstream;
            if (!struc.stream)
                throw ERROR_THROW(112);
            struc.stream->open(logfile);
            if (!(*(struc.stream)).is_open())
                throw ERROR_THROW(112);
            wcscpy(struc.logfile, logfile);
            return struc;
        }
        catch (Error::ERROR e)
        {
            Error::ErrorDisplay(e);
        }
    }

    void WriteLine(LOG log, char *c, ...)
    {
        for (char **c_ptr = &c; **c_ptr; c_ptr++)
        {
            *log.stream << *c_ptr;
        }
    }

    void WriteLine(LOG log, wchar_t *c, ...)
    {
        unsigned lineLength = 0;
        char *line = nullptr;
        unsigned int n = 0;
        for (wchar_t **c_ptr = &c; **c_ptr; c_ptr++)
        {
            lineLength = wcslen(*c_ptr);
            line = new char[lineLength + 1];
            wcstombs_s(&n, line, lineLength + 1u, *c_ptr, lineLength + 1u);
            *log.stream << line;
            delete[] line;
        }
    }

    void WriteLog(LOG log)
    {
        std::string line = "----Протокол------ "; // "Protocol"
        time_t result = time(NULL);
        char str[26];
        ctime_s(str, sizeof str, &result);
        for (int i = 0; i < 26; i++)
            line += str[i];
        *log.stream << line;
    }

    void WriteParm(LOG log, Parm::PARM parm)
    {
        char in_text[PARM_MAX_SIZE];
        char out_text[PARM_MAX_SIZE];
        char log_text[PARM_MAX_SIZE];
        wcstombs(in_text, parm.in, PARM_MAX_SIZE);
        wcstombs(out_text, parm.out, PARM_MAX_SIZE);
        wcstombs(log_text, parm.log, PARM_MAX_SIZE);
        *log.stream << "---- Параметры ---- \n-in: " << in_text // "Parameters"
                    << "\n-out: " << out_text
                    << "\n-log: " << log_text;
    }

    void WriteIn(LOG log, In::IN in)
    {
        // Strings: "Input data", "Number of characters", "Ignored", "Number of lines"
        *log.stream << "\n----Исходные данные------\nКол-во символов: " << (in.size < 0 ? 0 : in.size) << std::endl
                    << "Проигнорировано: " << (in.ignor < 0 ? 0 : in.ignor) << std::endl
                    << "Кол-во строк: " << (in.lines < 0 ? 0 : in.lines) << std::endl;
    }

    void WriteError(LOG log, Error::ERROR error)
    {
        // Strings: "Error", "Line", "Position"
        *log.stream << "Ошибка " << error.id << ": " << error.message << ", Строка " << error.inext.line << ", Позиция " << error.inext.col << std::endl;
    }

    void Close(LOG log)
    {
        (*log.stream).close();
        delete log.stream;
    }
}
And the header file:
#pragma once
#include <fstream>
#include <iostream>
#include <cwchar>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include "In.h"
#include "Parm.h"
#include "Error.h"
namespace Log
{
    struct LOG
    {
        wchar_t logfile[PARM_MAX_SIZE];
        std::ofstream *stream;
    };

    static const LOG INITLOG{L"", NULL};

    LOG getlog(wchar_t logfile[]);
    void WriteLine(LOG log, char *c, ...);
    void WriteLine(LOG log, wchar_t *c, ...);
    void WriteLog(LOG log);
    void WriteParm(LOG log, Parm::PARM parm);
    void WriteIn(LOG log, In::IN in);
    void WriteError(LOG log, Error::ERROR error);
    void Close(LOG log);
}
I have tried changing the include path and switching versions of C++, the standard library, and the compiler.

Issue with memory allocation in beginner OpenCL code

I am trying to run a beginner-level OpenCL test using an Intel CPU and integrated Iris graphics. I am compiling the code with the standard g++ and the -framework OpenCL compile switch. I've tried sanitizing the code, running it under gdb, and consulting a few guides online, but I'm still seeing an error, which I suspect is related to memory allocation. I have pasted my entire code below; please help if you see anything glaringly wrong.
Apologies for the verbose comments. Let me know if I have some wrong assumptions there as well :)
#include <iostream>
#include <OpenCL/opencl.h>
#include <cassert>
// the kernel that we want to execute on the device.
// here, you are doing an addition of elements in an array.
const char* kernelAdd =
{
    "__kernel void add (global int* data)\n"
    "{\n"
    " int work_item_id = get_global_id(0);\n"
    " data[work_item_id] *= 2;\n"
    "}\n"
};
int main (int argc, char* argv[])
{
    cl_int ret_val;

    // getting the platform ID that can be used - here we are getting only one
    cl_platform_id platformID;
    cl_uint numPlatforms;
    if((clGetPlatformIDs(1, &platformID, &numPlatforms)))
        std::cout << "clGetPlatformIDs failed!" << std::endl;

    // getting the OpenCL device ID for our GPU - here too, we are getting only one
    cl_device_id deviceID;
    cl_uint numDevices;
    if((clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, &numDevices)))
        std::cout << "clGetDeviceIDs failed!" << std::endl;

    // printing out some device info. here we have chosen CL_DEVICE_NAME.
    // you can choose any others by referring to
    // https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.html
    typedef char typeInfo;
    size_t sizeInfo = 16*sizeof(typeInfo);
    typeInfo* deviceInfo = new typeInfo(sizeInfo);
    if((clGetDeviceInfo(deviceID, CL_DEVICE_NAME, sizeInfo, (void*) deviceInfo, NULL)))
        std::cout << "clGetDeviceInfo failed!" << std::endl;
    std::cout << "CL_DEVICE_NAME = " << deviceInfo << ", platform ID = ";
    std::cout << platformID << ", deviceID = " << deviceID << std::endl;

    // set up a context for our device
    cl_context_properties contextProp[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformID, 0};
    cl_context context = clCreateContext(contextProp, 1, &deviceID, NULL, NULL, &ret_val);
    if (ret_val)
        std::cout << "clCreateContext failed!" << std::endl;

    // set up a queue for our device
    cl_command_queue queue = clCreateCommandQueue(context, deviceID, (cl_command_queue_properties) NULL, &ret_val);
    if (ret_val)
        std::cout << "clCreateCommandQueue failed!" << std::endl;

    // creating our data set that we want to compute on
    int N = 1 << 4;
    size_t data_size = sizeof(int) * N;
    int* input_data = new int(N);
    int* output_data = new int(N);
    for (int i = 0; i < data_size; i++)
    {
        input_data[i] = rand() % 1000;
    }

    // create a buffer to where you will eventually enqueue the program for the device
    cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size, NULL, &ret_val);
    if (ret_val)
        std::cout << "clCreateBuffer failed!" << std::endl;

    // copying our data set to the buffer
    if((clEnqueueWriteBuffer(queue, buffer, CL_TRUE, 0, data_size, input_data, 0, NULL, NULL)))
        std::cout << "clEnqueueWriteBuffer failed!" << std::endl;

    // we compile the device program with our source above and create a kernel for it.
    // also, we are allowed to create a device program with a binary that we can point to.
    cl_program program = clCreateProgramWithSource(context, 1, (const char**) &kernelAdd, NULL, &ret_val);
    if (ret_val)
        std::cout << "clCreateProgramWithSource failed!" << std::endl;
    if((clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL)))
        std::cout << "clBuildProgram failed!" << std::endl;
    cl_kernel kernel = clCreateKernel(program, "add", &ret_val);
    if (ret_val)
        std::cout << "clCreateKernel failed! ret_val = " << ret_val << std::endl;

    // configure options to find the arguments to the kernel
    if((clSetKernelArg(kernel, 0, sizeof(buffer), &buffer)))
        std::cout << "clSetKernelArg failed!" << std::endl;

    // the total number of work items that we want to use
    const size_t global_dimensions[3] = {data_size, 0, 0};
    if((clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_dimensions, NULL, 0, NULL, NULL)))
        std::cout << "clEnqueueNDRangeKernel failed!" << std::endl;

    // read back output into another buffer
    ret_val = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size, output_data, 0, NULL, NULL);
    if(ret_val)
        std::cout << "clEnqueueReadBuffer failed! ret_val = " << ret_val << std::endl;
    std::cout << "Kernel completed" << std::endl;

    // Release kernel, program, and memory objects
    if(clReleaseMemObject(buffer))
        std::cout << "clReleaseMemObject failed!" << std::endl;
    if(clReleaseKernel(kernel))
        std::cout << "clReleaseKernel failed!" << std::endl;
    if(clReleaseProgram(program))
        std::cout << "clReleaseProgram failed!" << std::endl;
    if(clReleaseCommandQueue(queue))
        std::cout << "clReleaseCommandQueue failed!" << std::endl;
    if(clReleaseContext(context))
        std::cout << "clReleaseContext failed!" << std::endl;

    for (int i = 0; i < data_size; i++)
    {
        assert(output_data[i] == input_data[i]/2);
    }
    return 0;
}
The output is as follows:
CL_DEVICE_NAME = Iris, platform ID = 0x7fff0000, deviceID = 0x1024500
objc[1034]: Method cache corrupted. This may be a message to an invalid object, or a memory error somewhere else.
objc[1034]: receiver 0x7fefb8712a90, SEL 0x7fff7ce87c58, isa 0x7fff99268208, cache 0x7fff99268218, buckets 0x7fefb87043c0, mask 0x3, occupied 0x1
objc[1034]: receiver 48 bytes, buckets 64 bytes
objc[1034]: selector 'dealloc'
objc[1034]: isa 'OS_xpc_array'
objc[1034]: Method cache corrupted. This may be a message to an invalid object, or a memory error somewhere else.
make: *** [all] Abort trap: 6
Quite a common mistake:
int* input_data = new int(N);
should be
int* input_data = new int[N];
Your version allocates a single int and initialises it to N; to allocate N integers you need square brackets. Note that output_data on the next line, and deviceInfo earlier in the program, are allocated the same way and have the same problem.
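For completeness, a minimal corrected sketch using std::vector, which sidesteps manual new[]/delete[] entirely (note the question's fill and check loops also run to data_size, a byte count of 64, rather than N, the element count of 16):
std::vector<int> input_data(N), output_data(N); // N ints, zero-initialised
for (int i = 0; i < N; i++)
    input_data[i] = rand() % 1000;
// pass input_data.data() and output_data.data() to the clEnqueue* calls;
// data_size = sizeof(int) * N remains the correct buffer size in bytes
This keeps the byte count for the OpenCL calls while the loops index elements, not bytes.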

Libfreenect: Why doesn't the depth callback function execute?

My problem is the following: I access the Kinect with this code:
#include "libfreenect.hpp"
#include <iostream>
freenect_context* ctx;
freenect_device* dev;

void freenect_threadfunc(freenect_device* dev, void* v_depth, uint32_t timestamp){
    short* d = (short*) v_depth;
    std::cout << d[0] << std::endl;
}

int main(int argc, char const *argv[])
{
    if(freenect_init(&ctx, NULL) < 0){
        std::cout << "freenect_init() failed!" << std::endl;
    }
    if (freenect_open_device(ctx, &dev, 0) < 0){
        std::cout << "No device found!" << std::endl;
        freenect_shutdown(ctx);
    }
    freenect_set_depth_callback(dev, freenect_threadfunc);
    freenect_set_depth_mode(dev, freenect_find_depth_mode(FREENECT_RESOLUTION_MEDIUM, FREENECT_DEPTH_11BIT));
    freenect_start_depth(dev);
    while (true) {
    }
    return 0;
}
But for some reason I don't know, the callback function freenect_threadfunc doesn't execute. When running freenect-glview, an example provided by OpenKinect, everything works fine.
Thank you for your help.

clang+llvm provides bad function pointers for JIT-compiled functions

I am trying to use clang+llvm 3.6 to JIT-compile several C functions (each of which can eventually be very large).
Unfortunately, the function pointer that LLVM provides makes the program segfault.
So far I have the following code:
#include <iostream>
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/Basic/DiagnosticOptions.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/Basic/SourceManager.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/CompilerInvocation.h>
#include <clang/Frontend/FrontendDiagnostic.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Frontend/Utils.h>
#include <clang/Parse/ParseAST.h>
#include <clang/Lex/Preprocessor.h>
#include <llvm/Analysis/Passes.h>
#include <llvm/ExecutionEngine/SectionMemoryManager.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Bitcode/ReaderWriter.h>
#include <llvm/Support/ManagedStatic.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Linker/Linker.h>
int main(int argc, char *argv[]) {
    using namespace llvm;
    using namespace clang;

    static const char* clangArgv [] = {"program", "-x", "c", "string-input"};
    static const int clangArgc = sizeof (clangArgv) / sizeof (clangArgv[0]);

    // C functions to be compiled (they could eventually be extremely large)
    std::map<std::string, std::string> func2Source;
    func2Source["getOne"] = "int getOne() {return 1;}";
    func2Source["getTwo"] = "int getTwo() {return 2;}";

    llvm::InitializeAllTargets();
    llvm::InitializeAllAsmPrinters();

    std::unique_ptr<llvm::Linker> linker;
    std::unique_ptr<llvm::LLVMContext> context(new llvm::LLVMContext());
    std::unique_ptr<llvm::Module> module;

    /**
     * add each C function to the same module
     */
    for (const auto& p : func2Source) {
        const std::string& source = p.second;

        IntrusiveRefCntPtr<DiagnosticOptions> diagOpts = new DiagnosticOptions();
        TextDiagnosticPrinter *diagClient = new TextDiagnosticPrinter(llvm::errs(), &*diagOpts); // will be owned by diags
        IntrusiveRefCntPtr<DiagnosticIDs> diagID(new DiagnosticIDs());
        IntrusiveRefCntPtr<DiagnosticsEngine> diags(new DiagnosticsEngine(diagID, &*diagOpts, diagClient));

        ArrayRef<const char *> args(clangArgv + 1, // skip program name
                                    clangArgc - 1);
        std::unique_ptr<CompilerInvocation> invocation(createInvocationFromCommandLine(args, diags));
        if (invocation.get() == nullptr) {
            std::cerr << "Failed to create compiler invocation" << std::endl;
            exit(1);
        }
        CompilerInvocation::setLangDefaults(*invocation->getLangOpts(), IK_C,
                                            LangStandard::lang_unspecified);
        invocation->getFrontendOpts().DisableFree = false; // make sure we free memory (by default it does not)

        // Create a compiler instance to handle the actual work.
        CompilerInstance compiler;
        compiler.setInvocation(invocation.release());

        // Create the compiler's actual diagnostics engine.
        compiler.createDiagnostics(); //compiler.createDiagnostics(argc, const_cast<char**> (argv));
        if (!compiler.hasDiagnostics()) {
            std::cerr << "No diagnostics" << std::endl;
            exit(1);
        }

        // Create memory buffer with source text
        std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(source, "SIMPLE_BUFFER");
        if (buffer.get() == nullptr) {
            std::cerr << "Failed to create memory buffer" << std::endl;
            exit(1);
        }
        // Remap auxiliary name "string-input" to memory buffer
        PreprocessorOptions& po = compiler.getInvocation().getPreprocessorOpts();
        po.addRemappedFile("string-input", buffer.release());

        // Create and execute the frontend to generate an LLVM bitcode module.
        clang::EmitLLVMOnlyAction action(context.get());
        if (!compiler.ExecuteAction(action)) {
            std::cerr << "Failed to emit LLVM bitcode" << std::endl;
            exit(1);
        }

        std::unique_ptr<llvm::Module> module1 = action.takeModule();
        if (module1.get() == nullptr) {
            std::cerr << "No module" << std::endl;
            exit(1);
        }

        if (linker.get() == nullptr) {
            module.reset(module1.release());
            linker.reset(new llvm::Linker(module.get()));
        } else {
            if (linker->linkInModule(module1.release())) {
                std::cerr << "LLVM failed to link module" << std::endl;
                exit(1);
            }
        }
    }

    llvm::InitializeNativeTarget();

    llvm::Module* m = module.get();

    std::string errStr;
    std::unique_ptr<llvm::ExecutionEngine> executionEngine(EngineBuilder(std::move(module))
            .setErrorStr(&errStr)
            .setEngineKind(EngineKind::JIT)
            .setMCJITMemoryManager(std::unique_ptr<SectionMemoryManager>(new SectionMemoryManager()))
            .setVerifyModules(true)
            .create());
    if (!executionEngine.get()) {
        std::cerr << "Could not create ExecutionEngine: " + errStr << std::endl;
        exit(1);
    }
    executionEngine->finalizeObject();

    /**
     * Let's try to use each function
     */
    for (const auto& p : func2Source) {
        const std::string& funcName = p.first;

        llvm::Function* func = m->getFunction(funcName);
        if (func == nullptr) {
            std::cerr << "Unable to find function '" << funcName << "' in LLVM module" << std::endl;
            exit(1);
        }

        // Validate the generated code, checking for consistency.
        llvm::raw_os_ostream os(std::cerr);
        bool failed = llvm::verifyFunction(*func, &os);
        if (failed) {
            std::cerr << "Failed to verify function '" << funcName << "' in LLVM module" << std::endl;
            exit(1);
        }

#if 1
        func->dump(); // Dump the function for exposition purposes.

        // JIT the function, returning a function pointer.
        void *fPtr = executionEngine->getPointerToFunction(func); ///// BAD function pointer!!!!

        // Cast it to the right type (takes no arguments, returns a double) so we
        // can call it as a native function.
        int (*funcPtr)();
        *(int **) (&funcPtr) = *(int **) fPtr;
        int v = (*funcPtr)();
        std::cout << "return: " << v << std::endl;
#else // THIS DOES NOT WORK EITHER:
        // JIT the function, returning a function pointer.
        uint64_t fPtr = executionEngine->getFunctionAddress(funcName); ///// BAD function pointer!!!!
        if (fPtr == 0) {
            std::cerr << "Unable to find function '" << funcName << "' in LLVM module" << std::endl;
            exit(1);
        }
        int (*funcPtr)();
        *(int **) (&funcPtr) = *(int **) fPtr;
        int v = (*funcPtr)();
        std::cout << "return: " << v << std::endl;
#endif
    }
}
Can anyone help me pinpoint the problem?
(I'm running this on Ubuntu 15.04.)
This assignment is incredibly messed up:
*(int **) (&funcPtr) = *(int **) fPtr;
Not only does it violate strict aliasing to write an int* and then use it as a function pointer on the next line, but a data pointer is often not large enough to hold an entire code pointer.
The safe approach is either
memcpy(&funcPtr, &fPtr, sizeof funcPtr);
or
funcPtr = reinterpret_cast<decltype(funcPtr)>(fPtr);
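Applied to the question's getFunctionAddress branch, the fixed call site would look roughly like this (a sketch; error handling as in the original):
uint64_t fPtr = executionEngine->getFunctionAddress(funcName);
if (fPtr == 0) {
    std::cerr << "Unable to find function '" << funcName << "' in LLVM module" << std::endl;
    exit(1);
}
int (*funcPtr)() = reinterpret_cast<int (*)()>(fPtr); // cast the address to the real signature
int v = funcPtr();
std::cout << "return: " << v << std::endl;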

How to pass a string to the GPU and get it back from the kernel to display it? (C++ OpenCL)

I would like to pass a string to my GPU and get it back from the GPU to print it. It is just for understanding purposes; I know the idea sounds senseless.
I tried:
OpenCL:
__kernel void same_in_same_out_char(__global uchar * out, __constant uchar * in){
    for (unsigned int ui=0; ui<3; ui++) out[ui]=in[ui];
}
C++:
#define __CL_ENABLE_EXCEPTIONS
#include <fstream>
#include <iostream>
#include <iterator>
#include <CL/cl.hpp>
#include <CL/opencl.h>
using namespace std;
int main () {
    vector<cl::Platform> platforms;
    vector<cl::Device> devices;
    vector<cl::Kernel> kernels;

    try {
        // create platform
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);

        // create context
        cl::Context context(devices);

        // create command queue
        cl::CommandQueue queue(context, devices[0]);

        // load opencl source
        ifstream cl_file("inout.cl");
        string cl_string(istreambuf_iterator<char>(cl_file), (istreambuf_iterator<char>()));
        cl::Program::Sources source(1, make_pair(cl_string.c_str(),
                                                 cl_string.length() + 1));

        // create program
        cl::Program program(context, source);

        // compile opencl source
        program.build(devices);

        // load named kernel from opencl source
        cl::Kernel kernel(program, "same_in_same_out_char");

        // create a message to send to kernel
        const char pwd[] = "MAX";
        cout << "char pwd[] : " << pwd << endl;
        cl_uchar * password = (cl_uchar*) &pwd;
        int bufferA_size = 3; // array size is 3
        int bufferC_size = 3; // array size is 3
        cout << " -- OpenCL -- " << endl;

        // allocate device buffer to hold message
        cl::Buffer bufferA(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uchar) * bufferA_size, password);
        cl::Buffer bufferC(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * bufferC_size);

        // set message as kernel argument
        kernel.setArg(0, bufferC);
        kernel.setArg(1, bufferA);

        // execute kernel
        queue.enqueueTask(kernel);

        // wait for completion
        queue.finish();

        // ----------------------
        cl_uint out_global[bufferC_size];
        queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, bufferC_size*sizeof(cl_uchar), &out_global);
        cout << "Output \t\t:" << *out_global << endl << "Output[1..n] \t:";
        for (unsigned int i=0; i<bufferC_size; i ++) cout << out_global[i] << " " ;
        cout << endl;
    } catch (cl::Error e) {
        cout << endl << e.what() << " : " << e.err() << endl;
    }
    return 0;
}
But I failed. The output is
Output :5783885
Output[1..n] :5783885 0 26
But not
MAX
or
77 65 88
(for M A X).
Regards,
Marcus
It is in fact giving you the answer you expect, but you are putting it in the wrong data type.
What you are getting is the single integer 5783885, which is 0x0058414D in hexadecimal. You are on a little-endian platform, so these bytes are arranged in memory from low-order to high-order, i.e. if you look at the memory, the bytes will be (still in hex):
4D, 41, 58, 00, ...
Displayed in decimal, these are:
77, 65, 88, 0, ...
In other words, exactly what you expect.
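To make the layout concrete, here is a small host-side sketch (not from the answer; it assumes cl_uint is 4 bytes):
cl_uint v = 5783885; // 0x0058414D
unsigned char* b = reinterpret_cast<unsigned char*>(&v);
// On a little-endian machine the bytes in memory are:
// b[0] == 77 ('M'), b[1] == 65 ('A'), b[2] == 88 ('X'), b[3] == 0
for (int i = 0; i < 4; i++) cout << (int)b[i] << " "; // prints: 77 65 88 0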
Your problem (one of your problems, at least) is that you've declared out_global as an array of cl_uint instead of cl_uchar or cl_char.
So change the following line, and you'll probably be fine:
cl_uint out_global[bufferC_size];
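With that change, a sketch of the corrected read-back (casting to int when printing, since cout would otherwise render each cl_uchar as a character):
cl_uchar out_global[bufferC_size];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, bufferC_size*sizeof(cl_uchar), &out_global);
for (unsigned int i = 0; i < bufferC_size; i++)
    cout << (int)out_global[i] << " "; // expected: 77 65 88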