Using extern functions in Halide with GPU - C++

I am trying to use extern functions in Halide and, in my context, I want them to run on the GPU.
I compile ahead of time (AOT) with the OpenCL target.
Of course, OpenCL can still use the CPU, so I use this:
halide_set_ocl_device_type("gpu");
For now, everything is scheduled at compute_root().
First question: if I use compute_root() and the OpenCL GPU device, will my pipeline be computed on the device, with the corresponding CopyHtoD and DtoH transfers? (Or will it stay in the host buffer?)
Second question, more related to the extern functions: we use extern calls because some of our algorithms are not written in Halide.
Extern declaration (Halide side):
foo.define_extern("cool_foo", args, Float(32), 4);
Extern implementation (C side):
extern "C" int cool_foo(buffer_t * in, int w, int h, int z, buffer_t * out){ .. }
But, in the cool_foo function, my buffer_t structures are only valid in host memory. The dev address is 0 (the default).
If I try to copy the memory before the algorithm:
halide_copy_to_dev(NULL, &in);
It does nothing.
If I make only the device memory available:
in.host = NULL;
My host pointer is null, but the device address is still 0.
(dev_dirty is true in my case and host_dirty is false.)
Any idea?
EDIT (To answer dsharlet)
Here's the structure of my code:
Parse the data correctly on the CPU --> send the buffer to the GPU (using halide_copy_to_dev...) --> enter the Halide pipeline, read parameters and add a boundary condition --> go into my extern function --> ...
I don't have a valid buffer_t in my extern function.
I schedule everything with compute_root(), but I use HL_TARGET=host-opencl and set the OpenCL device type to gpu.
Before entering Halide, I can read my device address and it is valid.
Here's my code:
Before Halide, everything is on the CPU (the pointer), and we transfer it to the GPU:
buffer_t k = { 0, (uint8_t *) k_full, {w_k, h_k, num_patch_x * num_patch_y * 3}, {1, w_k, w_k * h_k}, {0}, sizeof(float), };
#if defined( USEGPU )
// Transfer into GPU
halide_copy_to_dev(NULL, &k);
k.host_dirty = false;
k.dev_dirty = true;
//k.host = NULL; // It's k_full
#endif
halide_func(&k);
Inside Halide:
ImageParam ...
Func process;
process = halide_sub_func(k, width, height, k.channels());
process.compute_root();
...
Func halide_sub_func(ImageParam k, Expr width, Expr height, Expr patches)
{
Func kBounded("kBounded"), kShifted("kShifted"), khat("khat"), khat_tuple("khat_tuple");
kBounded = repeat_image(constant_exterior(k, 0.0f), 0, width, 0, height, 0, patches);
kShifted(x, y, pi) = kBounded(x + k.width() / 2, y + k.height() / 2, pi);
khat = extern_func(kShifted, width, height, patches);
khat_tuple(x, y, pi) = Tuple(khat(0, x, y, pi), khat(1, x, y, pi));
kShifted.compute_root();
khat.compute_root();
return khat_tuple;
}
Outside Halide(Extern function):
inline ....
{
// The buffer_t .dev and .host fields are 0 and null. I expect null for host, but not for dev...
}

I found the solution to my problem.
I post the answer as code just below. (Since I did a little offline test, the variable names don't match the code above.)
Inside Halide: (Halide_func.cpp)
#include <Halide.h>
using namespace Halide;
using namespace Halide::BoundaryConditions;
Func thirdPartyFunction(ImageParam f);
Func fourthPartyFunction(ImageParam f);
Var x, y;
int main(int argc, char **argv) {
// Input:
ImageParam f( Float( 32 ), 2, "f" );
printf(" Argument: %d\n",argc);
int test = atoi(argv[1]);
if (test == 1) {
Func f1;
f1(x, y) = f(x, y) + 1.0f;
f1.gpu_tile(x, 256);
std::vector<Argument> args( 1 );
args[ 0 ] = f;
f1.compile_to_file("halide_func", args);
} else if (test == 2) {
Func fOutput("fOutput");
Func fBounded("fBounded");
fBounded = repeat_image(f, 0, f.width(), 0, f.height());
fOutput(x, y) = fBounded(x-1, y) + 1.0f;
fOutput.gpu_tile(x, 256);
std::vector<Argument> args( 1 );
args[ 0 ] = f;
fOutput.compile_to_file("halide_func", args);
} else if (test == 3) {
Func h("hOut");
h = thirdPartyFunction(f);
h.gpu_tile(x, 256);
std::vector<Argument> args( 1 );
args[ 0 ] = f;
h.compile_to_file("halide_func", args);
} else {
Func h("hOut");
h = fourthPartyFunction(f);
std::vector<Argument> args( 1 );
args[ 0 ] = f;
h.compile_to_file("halide_func", args);
}
}
Func thirdPartyFunction(ImageParam f) {
Func g("g");
Func fBounded("fBounded");
Func h("h");
//Boundary
fBounded = repeat_image(f, 0, f.width(), 0, f.height());
g(x, y) = fBounded(x-1, y) + 1.0f;
h(x, y) = g(x, y) - 1.0f;
// Needs to be commented out if you want to use the GPU schedule.
//g.compute_root(); //At least one stage schedule alone
//h.compute_root();
return h;
}
Func fourthPartyFunction(ImageParam f) {
Func fBounded("fBounded");
Func g("g");
Func h("h");
//Boundary
fBounded = repeat_image(f, 0, f.width(), 0, f.height());
// Preprocess
g(x, y) = fBounded(x-1, y) + 1.0f;
g.compute_root();
g.gpu_tile(x, y, 256, 1);
// Extern
std::vector < ExternFuncArgument > args = { g, f.width(), f.height() };
h.define_extern("extern_func", args, Int(16), 3);
h.compute_root();
return h;
}
The external function: (external_func.h)
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cinttypes>
#include <cstring>
#include <fstream>
#include <map>
#include <vector>
#include <complex>
#include <chrono>
#include <iostream>
#include <clFFT.h> // All the OpenCL headers I need are included through this.
using namespace std;
// Useful stuff.
void completeDetails2D(buffer_t buffer) {
// Read all elements:
std::cout << "Buffer information:" << std::endl;
std::cout << "Extent: " << buffer.extent[0] << ", " << buffer.extent[1] << std::endl;
std::cout << "Stride: " << buffer.stride[0] << ", " << buffer.stride[1] << std::endl;
std::cout << "Min: " << buffer.min[0] << ", " << buffer.min[1] << std::endl;
std::cout << "Elem size: " << buffer.elem_size << std::endl;
std::cout << "Host dirty: " << buffer.host_dirty << ", Dev dirty: " << buffer.dev_dirty << std::endl;
printf("Host pointer: %p, Dev pointer: %" PRIu64 "\n\n\n", buffer.host, buffer.dev);
}
extern cl_context _ZN6Halide7Runtime8Internal11weak_cl_ctxE;
extern cl_command_queue _ZN6Halide7Runtime8Internal9weak_cl_qE;
extern "C" int extern_func(buffer_t * in, int width, int height, buffer_t * out)
{
printf("In extern\n");
completeDetails2D(*in);
printf("Out extern\n");
completeDetails2D(*out);
if(in->dev == 0) {
// Boundary stuff
in->min[0] = 0;
in->min[1] = 0;
in->extent[0] = width;
in->extent[1] = height;
return 0;
}
// Super awesome stuff on GPU
// ...
cl_context & ctx = _ZN6Halide7Runtime8Internal11weak_cl_ctxE; // Found by zougloub
cl_command_queue & queue = _ZN6Halide7Runtime8Internal9weak_cl_qE; // Same
printf("ctx: %p\n", ctx);
printf("queue: %p\n", queue);
cl_mem buffer_in;
buffer_in = (cl_mem) in->dev;
cl_mem buffer_out;
buffer_out = (cl_mem) out->dev;
// Just copying data from one buffer to another
int err = clEnqueueCopyBuffer(queue, buffer_in, buffer_out, 0, 0, 256*256*4, 0, NULL, NULL);
printf("copy: %d\n", err);
err = clFinish(queue);
printf("finish: %d\n\n", err);
return 0;
}
Finally, the non-Halide stuff: (Halide_test.cpp)
#include <halide_func.h>
#include <iostream>
#include <cinttypes>
#include <external_func.h>
// Extern functions available inside the generated .o.
#include "HalideRuntime.h"
int main(int argc, char **argv) {
// Init the kernel in GPU
halide_set_ocl_device_type("gpu");
// Create a buffer
int width = 256;
int height = 256;
float * bufferHostIn = (float*) malloc(sizeof(float) * width * height);
float * bufferHostOut = (float*) malloc(sizeof(float) * width * height);
for( int j = 0; j < height; ++j) {
for( int i = 0; i < width; ++i) {
bufferHostIn[i + j * width] = i+j;
}
}
buffer_t bufferHalideIn = {0, (uint8_t *) bufferHostIn, {width, height}, {1, width, width * height}, {0, 0}, sizeof(float), true, false};
buffer_t bufferHalideOut = {0, (uint8_t *) bufferHostOut, {width, height}, {1, width, width * height}, {0, 0}, sizeof(float), true, false};
printf("IN\n");
completeDetails2D(bufferHalideIn);
printf("Data (host): ");
for(int i = 0; i < 10; ++ i) {
printf(" %f, ", bufferHostIn[i]);
}
printf("\n");
printf("OUT\n");
completeDetails2D(bufferHalideOut);
// Send to GPU
halide_copy_to_dev(NULL, &bufferHalideIn);
halide_copy_to_dev(NULL, &bufferHalideOut);
bufferHalideIn.host_dirty = false;
bufferHalideIn.dev_dirty = true;
bufferHalideOut.host_dirty = false;
bufferHalideOut.dev_dirty = true;
// Trick Halide into using the device.
bufferHalideIn.host = NULL;
bufferHalideOut.host = NULL;
printf("IN After device\n");
completeDetails2D(bufferHalideIn);
// Halide function
halide_func(&bufferHalideIn, &bufferHalideOut);
// Get back to HOST
bufferHalideIn.host = (uint8_t*)bufferHostIn;
bufferHalideOut.host = (uint8_t*)bufferHostOut;
halide_copy_to_host(NULL, &bufferHalideOut);
halide_copy_to_host(NULL, &bufferHalideIn);
// Validation
printf("\nOUT\n");
completeDetails2D(bufferHalideOut);
printf("Data (host): ");
for(int i = 0; i < 10; ++ i) {
printf(" %f, ", bufferHostOut[i]);
}
printf("\n");
// Free all
free(bufferHostIn);
free(bufferHostOut);
}
You can compile halide_func with test 4 to use all of the extern functionality.
Here are some of the conclusions I reached (thanks to Zalman and zougloub):
compute_root() alone does not dispatch anything to the device.
We need gpu() or gpu_tile() in the code to invoke the GPU routines. (BTW, you need to put all your variables inside.)
Using gpu_tile() with fewer items than the tile size will crash your program.
BoundaryConditions work well on the GPU.
Before calling an extern function, the Func that goes in as an input needs:
f.compute_root(); f.gpu_tile(x, y, ..., ...); The compute_root() on the middle stage is not implicit.
If the dev address is 0, that's normal: we fill in the dimensions and the extern function will be called again.
The last stage has an implicit compute_root().

Are you aware of the bounds inference protocol for external array functions? This takes place when the host pointer of any buffer is NULL. (Briefly, in this case, you need to fill in the extent fields of the buffer_t structures that have NULL host pointers and do nothing else.) If you have already taken care of that, then ignore the above.
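For concreteness, a minimal sketch of that bounds-query branch, reusing the cool_foo signature from the question (this assumes the simple identity case where the input region needed equals the output region requested):
extern "C" int cool_foo(buffer_t *in, int w, int h, int z, buffer_t *out) {
    if (in->host == NULL) {
        // Bounds query: report which region of the input we need, and do nothing else.
        for (int d = 0; d < 4; d++) {
            in->min[d] = out->min[d];
            in->extent[d] = out->extent[d];
        }
        return 0;
    }
    // Normal call: all buffers are allocated here; do the real work.
    return 0;
}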
If you've tested that the host pointers are non-NULL for all buffers, then calling halide_copy_to_dev should work. You may need to explicitly set host_dirty to true beforehand to get the copy part to happen, depending on where the buffer came from. (I would hope Halide gets this right and it is already set if the buffer came from a previous pipeline stage on the CPU. But if the buffer came from something outside Halide, the dirty bits are probably false from initialization. It seems halide_dev_malloc should set dev_dirty if it allocates device memory, and currently it does not.)
I would expect the dev field to be populated after a call to halide_copy_to_dev as the first thing it does is call halide_dev_malloc. You can try calling halide_dev_malloc explicitly yourself, setting host_dirty and then calling halide_copy_to_dev.
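In other words, something along these lines (a sketch, using the old buffer_t runtime API that appears elsewhere in this question):
// k is a buffer_t whose host pointer, extents and strides are already filled in (as in the question).
halide_dev_malloc(NULL, &k);    // allocates device memory; k.dev should now be non-zero
k.host_dirty = true;            // mark the host data as the fresh copy
halide_copy_to_dev(NULL, &k);   // performs the host -> device copy because host_dirty is set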
Is the previous stage on the host or on the GPU? If it is on the GPU, I'd expect the input buffer to be on the GPU as well.
This API needs work. I am in the middle of a first refactoring of some things that will help, but ultimately it will require changing the buffer_t structure. It is possible to get most things to work, but it requires modifying the host_dirty and dev_dirty bits as well as calling the halide_dev* APIs in just the right way. Thank you for your patience.


Modifying a C++ array in main() from Lua without extra allocation

I am sketching a small C++ program that will pass arrays to Lua and have them modified there. I intend to have a Lua script read in by the program, so I can modify the script without needing to recompile the program.
My first obstacle is to ensure Lua is able to modify arrays already allocated instead of having them allocated again in the Lua space. The data will be float and the size will be really large, but I am starting small for the moment.
To simplify this interface I tried LuaBridge 2.6, but it doesn't provide the expected result. Below is a fully "working" program.
#include <iostream>
#include <cstdint>
#include <cstring>
#include <vector>
#include <lua5.3/lua.hpp>
#include <LuaBridge/LuaBridge.h>
int main(void)
{
const uint32_t LENGTH = 512 * 256;
std::vector <float> input(LENGTH),
output(LENGTH);
memset(output.data(), 0, LENGTH * sizeof(float)); // Zero the output
for(uint32_t i = 0; i < LENGTH; i++) // Populate input
input[i] = (float)i + 0.5f;
lua_State *luastate = luaL_newstate();
luabridge::push(luastate, input.data()); // Supposedly passing a pointer to the first element of input, according to LuaBridge manual chap 3-3.1
luabridge::push(luastate, output.data()); // Same for output
luaL_dostring(luastate, "output[10] = input[256]"); // Expecting to assign this value in the C++ arrays, not in the Lua space
lua_getglobal(luastate, "output[10]"); // Find this assigned value in the Lua stack
lua_Number val = lua_tonumber(luastate, 1); // Retrieving this value from Lua to C++
std::cout << input[256] << ", " << output[10] << ", " << val << std::endl; // The values of val and in output[10] don't match
lua_close(luastate);
return 0;
}
Notice that nothing matches. What goes into output[10] in Lua is not the value of input[256] in the C++ space, but input[0].
The C++ output array is not updated from within Lua; cout shows that it remains as we initialized it (0).
To confirm that, we pushed the value of output[10] onto the stack and retrieved it from C++; it does not match input[256].
Can you guys correct me or point me to where I should be going to achieve this?
======= UPDATE 08/11/2020 =======
To clarify what the program is doing (or is supposed to do), after reading Robert's and Joseph's considerations, I post below an updated version of both the C++ part and the Lua script called by it. Notice I abandoned LuaBridge since I didn't succeed with my first attempt:
C++:
#include <iostream>
#include <cstdint>
#include <cstring>
#include <vector>
#include <luajit-2.0/lua.hpp> // LuaJIT 2.0.4 from Ubuntu 16.04
int main(void)
{
const uint32_t LENGTH = 256 * 512;
std::vector <float> input(LENGTH),
output(LENGTH);
memset(output.data(), 0, LENGTH * sizeof(float));
for(uint32_t i = 0; i < LENGTH; i++)
input[i] = (float)i + 0.5f;
lua_State *luastate = luaL_newstate();
luaL_openlibs(luastate);
// Here I have to pass &input[0], &output[0] and LENGTH
// to Lua, which in turn will pass to whatever functions
// are being called from a .so lib opened in Lua-side
luaL_dofile(luastate, "my_script.lua");
lua_close(luastate);
return 0;
}
The Lua script looks like this:
local ffi = require("ffi")
local mylib = ffi.load("/path_to_lib/mylib.so")
-- Here I import and call any functions needed from mylib.so
-- without needing to recompile anything, just change this script
-- At this point the script has to know &input[0], &output[0] and LENGTH
ffi.cdef[[int func1(const float *in, float *out, const uint32_t LEN);]]
ffi.cdef[[int func2(const float *in, float *out, const uint32_t LEN);]]
ffi.cdef[[int funcX(const float *in, float *out, const uint32_t LEN);]]
if(mylib.func1(input, output, LENGTH) == 0) then
print("Func1 ran successfuly.")
else
print("Func1 failed.")
end
I am sketching a small C++ program that will pass arrays to Lua
The data will be float and the size will be really large,
My suggestion:
Keep the buffer on the C side (as a global variable for example)
Expose a C function to Lua: GetTableValue(Index)
Expose a C function to Lua: SetTableValue(Index, Value)
It should be something like this:
static int LUA_GetTableValue (lua_State *LuaState)
{
float Value;
lua_Integer Offset;
/* lua_gettop returns the number of arguments */
if ((lua_gettop(LuaState) == 1) && (lua_isinteger(LuaState, -1)))
{
/* Get the requested index (first parameter) */
Offset = lua_tointeger(LuaState, -1);
/* Get table value (LUA_FloatTable is the global C-side buffer) */
Value = LUA_FloatTable[Offset];
/* Push result to the stack */
lua_pushnumber(LuaState, Value);
}
else
{
lua_pushnil(LuaState);
}
/* return 1 value */
return 1;
}
And you also need to register the function:
lua_register(LuaState, "GetTableValue", LUA_GetTableValue);
I'll let you write SetTableValue yourself, but it should be very close to GetTableValue.
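A minimal sketch of what it might look like (assuming the same global LUA_FloatTable buffer as in GetTableValue above):
static int LUA_SetTableValue (lua_State *LuaState)
{
  /* Expects two arguments: the index and the new value */
  if ((lua_gettop(LuaState) == 2) && lua_isinteger(LuaState, -2) && lua_isnumber(LuaState, -1))
  {
    lua_Integer Offset = lua_tointeger(LuaState, -2);
    /* Store the value into the C-side buffer */
    LUA_FloatTable[Offset] = (float)lua_tonumber(LuaState, -1);
  }
  /* return 0 values */
  return 0;
}
And register it the same way:
lua_register(LuaState, "SetTableValue", LUA_SetTableValue);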
Doing so, the buffer is on C side and can be accessed from Lua with dedicated functions.
I recommend you create a userdata that exposes the arrays via __index and __newindex, something like this (written as a C and C++ polyglot like Lua itself):
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <lua5.3/lua.h>
#include <lua5.3/lauxlib.h>
#ifdef __cplusplus
}
#endif
struct MyNumbers {
lua_Number *arr;
lua_Integer len;
};
int MyNumbers_index(lua_State *L) {
struct MyNumbers *t = (struct MyNumbers *)luaL_checkudata(L, 1, "MyNumbers");
lua_Integer k = luaL_checkinteger(L, 2);
if(k >= 0 && k < t->len) {
lua_pushnumber(L, t->arr[k]);
} else {
lua_pushnil(L);
}
return 1;
}
int MyNumbers_newindex(lua_State *L) {
struct MyNumbers *t = (struct MyNumbers *)luaL_checkudata(L, 1, "MyNumbers");
lua_Integer k = luaL_checkinteger(L, 2);
if(k >= 0 && k < t->len) {
t->arr[k] = luaL_checknumber(L, 3);
return 0;
} else {
return luaL_argerror(L, 2,
lua_pushfstring(L, "index %d out of range", k));
}
}
struct MyNumbers *MyNumbers_new(lua_State *L, lua_Number *arr, lua_Integer len) {
struct MyNumbers *var = (struct MyNumbers *)lua_newuserdata(L, sizeof *var);
var->arr = arr;
var->len = len;
luaL_setmetatable(L, "MyNumbers");
return var;
}
int main(void) {
const lua_Integer LENGTH = 512 * 256;
lua_Number input[LENGTH], output[LENGTH];
memset(output, 0, sizeof output);
for(lua_Integer i = 0; i < LENGTH; ++i)
input[i] = i + 0.5f;
lua_State *L = luaL_newstate();
luaL_newmetatable(L, "MyNumbers");
lua_pushcfunction(L, MyNumbers_index);
lua_setfield(L, -2, "__index");
lua_pushcfunction(L, MyNumbers_newindex);
lua_setfield(L, -2, "__newindex");
/* exercise for the reader: implement __len and __pairs too, and maybe shift the indices so they're 1-based to Lua */
lua_pop(L, 1);
MyNumbers_new(L, input, LENGTH);
lua_setglobal(L, "input");
MyNumbers_new(L, output, LENGTH);
lua_setglobal(L, "output");
luaL_dostring(L, "output[10] = input[256]");
lua_getglobal(L, "output");
lua_geti(L, -1, 10);
lua_Number val = lua_tonumber(L, -1);
printf("%f, %f, %f\n", input[256], output[10], val);
lua_close(L);
}
With this approach, there is no copy of any data in Lua, and your own MyNumbers_ functions control how all access to them is done.
If you want to be able to use the arrays through LuaJIT's FFI instead of directly manipulating them in Lua, then you can pass their addresses in a light userdata instead, like this:
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <luajit-2.0/lua.h>
#include <luajit-2.0/lualib.h>
#include <luajit-2.0/lauxlib.h>
#ifdef __cplusplus
}
#endif
int main(void) {
const lua_Integer LENGTH = 256 * 512;
lua_Number input[LENGTH], output[LENGTH];
memset(output, 0, sizeof output);
for(lua_Integer i = 0; i < LENGTH; ++i)
input[i] = i + 0.5f;
lua_State *L = luaL_newstate();
luaL_openlibs(L);
lua_pushlightuserdata(L, input);
lua_setglobal(L, "input");
lua_pushlightuserdata(L, output);
lua_setglobal(L, "output");
lua_pushinteger(L, LENGTH);
lua_setglobal(L, "LENGTH");
luaL_dofile(L, "my_script.lua");
lua_close(L);
}

Looking for insight into why I'm getting an access violation here

I'm trying to create a list of points whose z value will be modified by the alpha values of a grayscale image. As the points are being assigned to a list, I keep getting an access violation. I noticed during debugging that the size of the alpha array is suddenly changing in the middle of the for loop, along with my width and height values. I'm new to C++ so I imagine it's an obvious mistake. Here's the relevant code:
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
// image processing libs
#include "vendors/stb_image/stb_image.h"
#include "vendors/stb_image/stb_image_write.h"
#include "Image.h"
#include "Renderer.h"
#include "VertexBufferLayout.h"
// signed normalization function for scaling input value with a known input range to an output range
float snorm(float value, float in_Min, float in_Max)
{
float out_Value = ( ((1.0f - 1.0f) * ((value - in_Min) / (in_Max - in_Min))) + -1.0f );
return out_Value;
}
int main(void)
{
// CONSTANTS
const int SCREEN_WIDTH = 2000;
const int SCREEN_HEIGHT = 2000;
// glsl version
const char* glsl_version = "#version 330";
Image image("res/images/drama_mask_white.jpg");
// loads an image to greyscale (normal) and returns the path to that normal
std::string normal_Path = image.ImgToGrayScale("gray_mask");
image.GetAlphas(normal_Path);
image.setMinMaxAlphas();
const std::vector<float> * lcl_Alphas = &image.alpha_Map;
const int lcl_Width = image.img_Width;
const int lcl_Height = image.img_Height;
const float x_Increment = 2.0f / lcl_Width;
const float y_Increment = 2.0f / lcl_Height;
float positions[] = { 0 };
//unsigned int indices[] = { 0 };
unsigned int row = 0;
unsigned int col = 0;
unsigned int num_Rows = 0;
unsigned int num_Cols = 0;
unsigned int num_Verteces = 0;
unsigned int pos_Count = 0;
for (int i = 0; (unsigned)i < image.alpha_Map.size(); i++)
{
// checks for the end of the row
if (i > 0 && (i % (image.img_Width - 1)) == 0)
{
row++; // if we've reached the end of a row, increment row index
num_Cols = col;
col = 0; // reset column index at end of each row
}
// assign position values starting from bottom left
// X
positions[pos_Count] = -1.0f + (col * x_Increment);
// Y
positions[pos_Count + 1] = -1.0f + (row * y_Increment);
// Z
// ERROR OCCURS HERE
positions[pos_Count + 2] = snorm(image.alpha_Map[i], image.min_Alpha, image.max_Alpha);
pos_Count += 3;
// indices
//indices[i] = i;
col++; // increment column index
num_Verteces++;
}
std::cout << "Num Verteces: " << num_Verteces << std::endl;
std::cout << "Num Positions: " << pos_Count << std::endl;
num_Rows = row;
GLFWwindow* window;
/* Initialize the library */
if (!glfwInit())
return -1;
// create window and context with core profile
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
/* Create a windowed mode window and its OpenGL context */
window = glfwCreateWindow(SCREEN_WIDTH, SCREEN_HEIGHT, "Hello World", NULL, NULL);
if (!window)
{
glfwTerminate();
return -1;
}
/* Make the window's context current */
glfwMakeContextCurrent(window);
glfwSwapInterval(1);
// checks if GLEW intializes correctly
if (glewInit() != GLEW_OK)
std::cout << "ERROR!" << std::endl;
std::cout << glGetString(GL_VERSION) << std::endl;
// enable blending
GLCall(glEnable(GL_BLEND));
// get source alpha, subtract from one to blend alpha at destination
GLCall(glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA));
// init renderer object
Renderer renderer;
GLfloat test_Vertex[] = { SCREEN_WIDTH / 2, SCREEN_HEIGHT / 2, 0.0f };
while (!glfwWindowShouldClose(window)) {
GLCall(glClearColor(0.0, 0.0, 0.0, 1.0));
renderer.Clear();
glEnableClientState(GL_VERTEX_ARRAY);
glVertexPointer(3, GL_FLOAT, 0, test_Vertex);
glDisableClientState(GL_VERTEX_ARRAY);
glDrawArrays(GL_POINTS, 0, 1);
// Swap front and back buffers
glfwSwapBuffers(window);
// Poll for, and process events
glfwPollEvents();
}
glfwTerminate();
}
I see you are a new contributor. I think in the future you can do a better job of 'minimizing' your example code. The general idea: try to delete lines so the error still happens with as little code as possible; maybe you will find the line that makes the difference. This also helps you find mistakes before you have to ask others.
I think a big problem here may be that your array of positions will always be of size 1.
This makes it harder to figure out any other problems in the rest of the code till this is fixed.
Using float positions[] = { 0 }; means the compiler will only reserve enough space for the positions array to contain 1 float value. So writing to positions[0] = 42 is valid, but writing to positions[1] = 42 can already be bad. Maybe you are lucky and your program crashes immediately. If you are unlucky, your program will write to the memory past the end of your array, and if you are extra unlucky this memory contains something important. It could contain something like the size of your vector of alphas or any other data in memory, so the errors it causes can become very unpredictable.
int main() {
char greeting[] = "Hello World!\n";
float positions[] = { 0 };
positions[0] = 42; // GOOD inside array bounds
positions[4] = 42; // BAD outside array bounds destroying other data
std::cout << greeting;
}
Here is an example where I break a hello-world greeting on purpose by writing out of bounds of an array. If you provide information about the IDE or compiler you are using, people will be able to tell you how to enable or view warnings for such mistakes, for example GCC's -Wall or -Warray-bounds. Or you can Google it yourself by adding "out of bounds warning".
Output of the example
> clang++-7 -pthread -std=c++17 -o main main.cpp
main.cpp:7:3: warning: array index 4 is past the end of the array
(which contains 1 element) [-Warray-bounds]
positions[4] = 42; // BAD outside array bounds destroying other data
^ ~
main.cpp:5:3: note: array 'positions' declared here
float positions[] = { 0 };
^
1 warning generated.
> ./main
Hello Worl
#Rabbid76 and #Peter had the right idea here... I needed to use std::vector because I was initializing the array with only one element, but populating it during the for loop. Once I converted to std::vector it worked fine. Now to make this thing actually draw the points...
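For anyone hitting the same issue, the change is roughly this (a sketch; names taken from the loop in the question):
// Use a growable container instead of a 1-element C array.
std::vector<float> positions;
positions.reserve(image.alpha_Map.size() * 3); // optional: avoids repeated reallocations
// ... inside the loop, replace the positions[pos_Count + n] = ... writes with:
positions.push_back(-1.0f + (col * x_Increment));                                 // X
positions.push_back(-1.0f + (row * y_Increment));                                 // Y
positions.push_back(snorm(image.alpha_Map[i], image.min_Alpha, image.max_Alpha)); // Z
pos_Count += 3;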

Simple usage example of gif-h library

I'm attempting to create a minimal usage example for https://github.com/ginsweater/gif-h
But, starting with a vector<uint8_t> of size imageWidth*imageHeight, the second GifWriteFrame call throws an "access violation reading location" exception.
My attempt:
#include <gif.h>
#include "BOBImageConversion.h"
int main(void)
{
// USAGE:
// Create a GifWriter struct. Pass it to GifBegin() to initialize and write the header.
// Pass subsequent frames to GifWriteFrame().
// Finally, call GifEnd() to close the file handle and free memory.
int delay = 100;
auto i1 = BOBImageIO::BOBLoadImage("Camera7.png");
auto i2 = BOBImageIO::BOBLoadImage("Camera18.png");
vector<uint8_t> vi1 = BOBImageConversion::ARGB2RGBAuint8(i1);
vector<uint8_t> vi2 = BOBImageConversion::ARGB2RGBAuint8(i2);
cout << (vi1.size() == i1.Width()*i1.Height()) << endl; // true
cout << (vi2.size() == i2.Width()*i2.Height()) << endl; // true
auto fileName = "gif.gif";
GifWriter g;
GifBegin(&g, fileName, i1.Width(), i1.Height(), delay);
GifWriteFrame(&g, vi1.data(), i1.Width(), i1.Height(), delay);
GifWriteFrame(&g, vi2.data(), i2.Width(), i2.Height(), delay); // Exception thrown: Access violation reading location
GifEnd(&g);
return 0;
}
Regarding the point above, the code posted is a minimal example. What's wrong?
This works. (Note that GifWriteFrame expects 8-bit RGBA input, i.e. width * height * 4 bytes per frame, whereas the buffers in the question only hold width * height bytes.)
#include <vector>
#include <cstdint>
#include <gif.h>
int main()
{
int width = 100;
int height = 200;
std::vector<uint8_t> vi1(width * height * 4, 0);
std::vector<uint8_t> vi2(width * height * 4, 255);
auto fileName = "bwgif.gif";
int delay = 100;
GifWriter g;
GifBegin(&g, fileName, width, height, delay);
GifWriteFrame(&g, vi1.data(), width, height, delay);
GifWriteFrame(&g, vi2.data(), width, height, delay);
GifEnd(&g);
return 0;
}

C++ OpenCL kernel setArg returns CL_INVALID_ARG_VALUE for __read_only image2d_t

I'm taking my first, wobbly steps with OpenCL (the C++ API) and I've run into a problem where I'm at my wit's end and simply have no idea what might be wrong. When setting the image2d_t kernel argument (see distilled code below), the call throws and returns with error code -50 (= CL_INVALID_ARG_VALUE ) and I have absolutely no idea why. I'm using the Intel OpenCL platform and my CPU (OpenCL 2.0) and I've been able to compile and run other sample code without any problems.
#include "stdafx.h"
#include <vector>
#include <string>
#include <iostream>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
std::string my_kernel = R"(
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE;
__kernel void simple_copy( __read_only image2d_t input_image
, __write_only image2d_t output_image
)
{
int2 coords;
coords.x = get_global_id(0);
coords.y = get_global_id(1);
float4 value = read_imagef(input_image, sampler, coords);
write_imagef(output_image, coords, value);
})";
int main()
{
try
{
//define source and destination image data
constexpr unsigned channels = 4, width = 2, height = 2;
std::array<std::uint8_t, channels*width*height> source = { 0,0,0,255, 255,0,0,255, 0,255,0,255, 0,0,255,255 };
std::array<std::uint8_t, channels*width*height> destination{}; //value initialization
//initialize OpenCL
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
auto platform = platforms.front();
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
auto device = devices.front();
auto context = cl::Context(device);
auto queue = cl::CommandQueue(context, device);
//create OpenCL image buffers
auto image_format = cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8);
auto input_image = cl::Image2D(context, CL_MEM_READ_ONLY, image_format, width, height);
auto output_image = cl::Image2D(context, CL_MEM_WRITE_ONLY, image_format, width, height);
//transfer source image data to device
auto origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{width, height, 1};
queue.enqueueWriteImage(input_image, CL_TRUE, origin, region, 0, 0, &source[0]);
//compile device code, retrieve kernel, set kernel arguments
auto program = cl::Program(my_kernel, CL_TRUE);
auto kernel = cl::Kernel(program, "simple_copy");
kernel.setArg(0, input_image); //ERROR: throws -50 (= CL_INVALID_ARG_VALUE ) ... why?!
kernel.setArg(1, output_image);
//enqueue copy kernel
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(width, height), cl::NullRange);
//read result back to host
queue.enqueueReadImage(output_image, CL_TRUE, origin, region, 0, 0, &destination[0]);
//output result
for (auto const & val : destination)
{
std::cout << val << " ";
}
std::cout << std::endl;
}
catch (cl::Error & e)
{
std::cout << "caught OpenCL exception - what: " << e.what() << " err: " << e.err() << std::endl;
}
std::string str;
std::getline(std::cin, str);
return 0;
}
Create your program with a context, like so:
auto program = cl::Program(context, my_kernel, CL_TRUE);
The cl::Program(source, build) constructor used in the question builds against the default context, so the kernel most likely ends up in a different context than the images and queue you created explicitly, which makes those image arguments invalid for it.

C/CUDA: Only every fourth element in CudaArray can be indexed

This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C project in which a binary raw file with 3D data is loaded, processed in CUDA, and saved to a new binary raw file.
It is based on the simpleTexture3D project from the CUDA Samples.
This is my .cpp:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>
typedef unsigned int uint;
typedef unsigned char uchar;
const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 64, height = 64, depth=64;
//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);
uint *d_output = NULL;
int *pArgc = NULL;
char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);
void loadVolumeData(char *exec_path);
// render image using CUDA
void render()
{
// call CUDA kernel
render_kernel(gridSize, blockSize, d_output, width, height, depth);
getLastCudaError("render_kernel failed");
}
void cleanup()
{
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
checkCudaErrors(cudaDeviceReset());
}
// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
FILE *fp = fopen(filename, "rb");
if (!fp)
{
fprintf(stderr, "Error opening file '%s'\n", filename);
return 0;
}
uchar *data = (uchar *) malloc(size);
size_t read = fread(data, 1, size, fp);
fclose(fp);
printf("Read '%s', %lu bytes\n", filename, read);
return data;
}
// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
{
int returnState=0;
// cut file extension from filename
char *a=strdup(filename); // strdup duplicates a const char* into a char*; you must free it yourself
int len = strlen(a);
a[len-4] = '\0'; //deletes '.raw'
//printf("%s\n",a);
char b[50];
sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);
//char b[]="_out.raw"; //Add suffix out to filename
char buffer[256]; // <- danger, only storage for 256 characters.
strncpy(buffer, a, sizeof(buffer));
strncat(buffer, b, sizeof(buffer));
free(a);
FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared
if (!fp)
{
fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
return 0;
}
size_t write = fwrite(data, 1, size, fp);
fclose(fp);
if (write==size)
{
printf("Wrote %lu bytes to '%s'\n", write, buffer);
return 0;
}
else
{
printf("Error writing data to file '%s'\n", buffer);
return 1;
}
}
// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
{
int result = 0;
result = findCudaDevice(argc, (const char **)argv);
return result;
}
void runAutoTest(char *exec_path, char *PathToFile)
{
// set path
char *path;
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
// Allocate output memory
checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
// zero out the output array with cudaMemset
cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
// render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, depth);
checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("render_kernel failed");
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
int wState=writeRawFile(path,h_output,width*height*depth);
checkCudaErrors(cudaFree(d_output));
free(h_output);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
//exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void loadVolumeData(char *exec_path, char *PathToFile)
{
char *path;
// load volume data
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
uchar *h_volume = loadRawFile(path, size);
//int wState=writeRawFile(path,h_volume,size);
initCuda(h_volume, volumeSize);
free(h_volume);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
pArgc = &argc;
pArgv = argv;
char *image_file = NULL;
printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) //Note cmd line argument is -file "PathToFile/File.raw"
{ // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"
getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
}
if (image_file)
{
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],image_file);
runAutoTest(argv[0],image_file);
}
else
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],NULL);
runAutoTest(argv[0],NULL);
}
printf("I am finished...\n"
"Can I get some ice cream please\n");
exit(EXIT_SUCCESS);
}
And this is my .cu
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
#include <helper_math.h>
typedef unsigned int uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex; // 3D texture
cudaArray *d_volumeArray = 0;
__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
// float u = x / (float) imageW;
// float v = y / (float) imageH;
//float w = z / (float) imageD;
// // read from 3D texture
// float voxel = tex3D(tex, u, v, w);
uint ps=__umul24(imageW,imageH);
if ((x < imageW) && (y < imageH) && (z < imageD))
{
// write output color
uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
d_output[1] = (uchar) 255;//+0*voxel*255;
}
}
extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
// create 3D array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&copyParams));
// set texture parameters
tex.normalized = true; // access with normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeBorder; // wrap texture coordinates
tex.addressMode[1] = cudaAddressModeBorder;
tex.addressMode[2] = cudaAddressModeBorder;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
{
d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
As you can see, currently I set all values to zero except index = 1, which is set to 255. Yet when I open the image stack in Fiji, I see that the fourth pixel on the first slice is white. If I use index = i instead, I get white vertical lines across the image stack, periodically every four columns. Generally speaking, it seems that only every fourth element is being indexed in the CudaArray. So I am wondering if there is some kind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. There would obviously be the factor 4 :)
I am eager to hear from you experts.
Cheers Mika
I figured it out by myself. The kernel works with a uint *d_output, while the copy back to the host is written into a uchar *h_output:
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
This mismatch led to the strange behavior.
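One possible fix (just a sketch against the sample above, not a drop-in patch) is to use uchar consistently for the output buffer on both sides:
// In the .cpp: declare and allocate the output as uchar instead of uint.
uchar *d_output = NULL;
checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
// In the .cu: the kernel then also takes uchar* and indexes bytes.
__global__ void
d_render(uchar *d_output, uint imageW, uint imageH, uint imageD)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
    uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
    uint ps = __umul24(imageW, imageH);
    if ((x < imageW) && (y < imageH) && (z < imageD))
    {
        uint i = __umul24(z, ps) + __umul24(y, imageW) + x;
        d_output[i] = (uchar) 255; // each write now lands on exactly one byte of the output volume
    }
}
// The declaration of render_kernel() and the d_output variable elsewhere need the same uint -> uchar change.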