Cuda Writing to an array on the device doesn't change value - c++

I have a 1D float3 pixel array, for testing I'm trying to set an array value on the device. I don't get an error but when I print the array value it says 0.
This is my device code.
__global__ void addKernel(float3 *pixeld_d[])
{
pixeld_d[threadIdx.x + W *blockIdx.x] = &make_float3(255, 30, 123);
printf("\n Block %d Thread %d Pixeld_d %d",blockIdx.x,threadIdx.x, pixeld_d[threadIdx.x + W * blockIdx.x]->x);
}
My host code:
float3* pixeld = new float3[W*H];
float3** pixeld_d = new float3*[W*H];
status = cudaMallocManaged((void **)&pixeld_d,(W*H)*sizeof(float3));
status = cudaMemcpy(pixeld_d,pixeld, (W*H) * sizeof(float3), cudaMemcpyHostToDevice);
addKernel << <W,H >> > (pixeld_d);
In the console i get results like this:
Block 811 Thread 25 Pixeld_d 0
I expect Pixeld_d to be 255 but it is 0.
Here the full code(All the commented code is commented because i removed somethings from the function call and vs would give me build errors):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <SFML/Graphics.hpp>
#include <stdio.h>
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
}
}
#define W 960
#define H 540
int mov;
#define Vector3 float3
//,Sphere sphere,Sphere light
#pragma region MyRegion
__device__ inline double dot(const Vector3& a, const Vector3& b) {
return (a.x*b.x + a.y*b.y + a.z*b.z);
}
__device__ struct Sphere
{
Vector3 c;
float r;
Sphere(Vector3 i, float j) { c = i, r = j; }
Vector3 getNormal(const Vector3& pi) const { return (make_float3(make_float3(pi.x - c.x, pi.y - c.y, pi.z - c.z).x / r, make_float3(pi.x - c.x, pi.y - c.y, pi.z - c.z).y / r, make_float3(pi.x - c.x, pi.y - c.y, pi.z - c.z).z / r)); }
};
__device__ __host__ struct Color
{
int r, g, b;
Color(float a, float e, float t) { r = a, g = e, b = t; }
};
#pragma endregion
__global__ void addKernel(float3 *pixeld_d[])
{
pixeld_d[threadIdx.x + W *blockIdx.x] = &make_float3(255, 30, 123);
printf("\n Block %d Thread %d Pixeld_d %d",blockIdx.x,threadIdx.x, pixeld_d[threadIdx.x + W * blockIdx.x]->x);
return;/*
float3 black = make_float3(0, 0, 0);
float3 red = make_float3(255, 0, 0);
float3 white = make_float3(255, 255, 255);
pixeld_d[threadIdx.y] = &black;
float3 o = make_float3(blockIdx.x, threadIdx.x, 0);
float3 d = make_float3(0, 0, 1);
double t = 20000;
const Vector3 oc = make_float3(o.x - sphere.c.x, o.y - sphere.c.y, o.z - sphere.c.z);
const double b = 2 * dot(oc, d);
const double c = dot(oc, oc) - sphere.r * sphere.r;
double disc = b * b - 4 * c;
if (!disc < 1e-4)
{
disc = sqrt(disc);
const double t0 = -b - disc;
const double t1 = -b + disc;
t = (t0 < t1) ? t0 : t1;
Vector3 pi = make_float3(o.x + make_float3(d.x * t,d.y * t, d.z * t).x, o.y + make_float3(d.x * t, d.y * t, d.z * t).y,o.z + make_float3(d.x * t, d.y * t, d.z * t).z);
Vector3 L = make_float3(light.c.x - pi.x, light.c.y - pi.y, light.c.z - pi.z);
Vector3 N = make_float3(make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).x / sphere.r, make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).y / sphere.r, make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).z / sphere.r);
double mg = sqrt(L.x*L.x + L.y * L.y + L.z * L.z);
float3 Lf = make_float3(L.x / mg, L.y / mg, L.z / mg);
mg = sqrt(N.x*N.x + N.y * N.y + N.z * N.z);
float3 Nf = make_float3(N.x / mg, N.y / mg, N.z / mg);
float dt = dot(Lf,Nf);
int r = (red.x + white.x * dt)*0.5;
int g = (red.y + white.y * dt)*0.5;
int b = (red.z + white.z * dt)*0.5;
if (r < 0)
r = 0;
if (g < 0)
g = 0;
if (b < 0)
b = 0;
pixeld_d[threadIdx.y]->x = r;
pixeld_d[threadIdx.y]->y = g;
pixeld_d[threadIdx.y]->z = b;
}
*/
}
int main()
{
sf::RenderWindow window(sf::VideoMode(W, H), "SFML works!");
sf::Image image;
image.create(W, H, sf::Color::Black);
sf::Texture tex;
sf::Sprite sprite;
while (window.isOpen())
{
Sphere *sphere;
Sphere *light;
cudaMalloc((void **)&sphere, sizeof(Sphere));
cudaMalloc((void **)&light, sizeof(Sphere));
if (sf::Keyboard::isKeyPressed(sf::Keyboard::A))
{
mov -= 3;
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::D))
{
mov += 3;
}
window.clear();
cudaError_t status;
float3* pixeld = new float3[W*H];
float3** pixeld_d = new float3*[W*H];
status = cudaMallocManaged((void **)&pixeld_d,(W*H)*sizeof(float3));
status = cudaMemcpy(pixeld_d,pixeld, (W*H) * sizeof(float3), cudaMemcpyHostToDevice);
addKernel << <W,H >> > (pixeld_d);
std::cout << cudaGetErrorString(status);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaMemcpy(pixeld,pixeld_d,(W*H)*sizeof(float3), cudaMemcpyDeviceToHost);
std::cout << pixeld[399359].x;
cudaFree(pixeld_d);
for (int x = 0; x < W; x++)
{
for (int y = 0; y < H; y++)
{
sf::Color pixel;
pixel.r = pixeld[x*W*y].x;
pixel.g = pixeld[x*W*y].y;
pixel.b = pixeld[x*W*y].z;
image.setPixel(x, y, pixel);
}
}
tex.loadFromImage(image);
sprite.setTexture(tex, true);
window.draw(sprite);
window.display();
}
//,*sphere,*light
return 0;
}
´´´

Your program has undefined behavior. Due to array decay, this
__global__ void addKernel(float3 *pixeld_d[])
is equivalent to
__global__ void addKernel(float3 **pixeld_d)
So you have declared your kernel function to take a pointer to a pointer to a float3 as input argument. I'm speculating here, but I would guess that this is most likely what originally caused you to introduce all the following issues in an attempy to make the compiler shut up and compile the code. What you actually wanted to write is
__global__ void addKernel(float3 *pixeld_d)
i.e., pass your kernel a pointer to an array of float3 into which it should write the result.
On the host side, you have your pixeld_d, which is a pointer to an array of pointers to float3 initialized to point to a dynamically-allocated array of pointers
float3** pixeld_d = new float3*[W*H];
I'm speculating again, but most likely, you actually wanted this to be just a float3*, but the compiler wouldn't allow you to use that as an argument in your kernel call. Right after that, you immediately overwrite that pointer with the result of a device memory allocation, leaking the previously allocated host memory in the process:
status = cudaMallocManaged((void **)&pixeld_d,(W*H)*sizeof(float3));
Note that the types don't match here. You allocate a buffer for an array of float3 (presumably because that's what you actually wanted) rather than an array of float3*, which is what the types you're using at this point would mandate. &pixel_d is actually a float3***. So the compiler would have caught your mistake right there, but you forced the compiler to shut up with a C-style cast. This is the first place where you invoke undefined behavior. Unfortunately, this kind of error will typically not result in a crash and your program will just continue to behave as expected.
You then go ahead and launch your kernel, which performs the following operation:
pixeld_d[threadIdx.x + W *blockIdx.x] = &make_float3(255, 30, 123);
Here, you're attempting to assign the address of a temporary object (the result of make_float3()) to each element of your float3 array. I'm not sure how you managed to compile this code as it's not legal C++ and any C++ compiler (nvcc included) should refuse to compile it. Even if you did somehow manage to compile this: These temporary objects will automatically be destroyed at the end of this line and the pointers you got there wouldn't point to a valid object anymore. I'm speculating again, but I would assume that this was also just done in an attempt to make the compiler shut up due to the mismatching types. pixeld_d[i] is actually a float3* rather than a float3 because the type of pointer you're using here doesn't match the type of buffer you're actually trying to use.
The morale of the story: Don't just make arbitrary changes to your code until the compiler shuts up. Try to understand why it's refusing to compile code. Usually, the reasons are that one is trying to do something that doesn't make sense. Change the code only once you understood what the problem was and how to fix it…and don't use C-style casts in C++…

I had to remove * in __global__ void addKernel(float3 *pixeld_d[]) and remove the & in front of make_float3

Related

Differences between NVCC and NVRTC on compilation to PTX

Summary
I'm porting a simple raytracing application based on the Scratchapixel version to a bunch of GPU libraries. I sucessfully ported it to CUDA using the runtime API and the driver API, but It throws a Segmentation fault (core dumped) when I try to use the PTX compiled at runtime with NVRTC.
If I uncomment the #include <math.h> directive at the beginning of the kernel file (see below), it still works using NVCC (the generated PTX is exactly the same) but fails at compilation using NVRTC.
I want to know how can I make NVRTC behave just like NVCC (is it even possible?), or at least to understand the reason behind this issues.
Detailed description
File kernel.cu (Kernel source):
//#include <math.h>
#define MAX_RAY_DEPTH 5
template<typename T>
class Vec3
{
public:
T x, y, z;
__device__ Vec3() : x(T(0)), y(T(0)), z(T(0)) {}
__device__ Vec3(T xx) : x(xx), y(xx), z(xx) {}
__device__ Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {}
__device__ Vec3& normalize()
{
T nor2 = length2();
if (nor2 > 0) {
T invNor = 1 / sqrt(nor2);
x *= invNor, y *= invNor, z *= invNor;
}
return *this;
}
__device__ Vec3<T> operator * (const T &f) const { return Vec3<T>(x * f, y * f, z * f); }
__device__ Vec3<T> operator * (const Vec3<T> &v) const { return Vec3<T>(x * v.x, y * v.y, z * v.z); }
__device__ T dot(const Vec3<T> &v) const { return x * v.x + y * v.y + z * v.z; }
__device__ Vec3<T> operator - (const Vec3<T> &v) const { return Vec3<T>(x - v.x, y - v.y, z - v.z); }
__device__ Vec3<T> operator + (const Vec3<T> &v) const { return Vec3<T>(x + v.x, y + v.y, z + v.z); }
__device__ Vec3<T>& operator += (const Vec3<T> &v) { x += v.x, y += v.y, z += v.z; return *this; }
__device__ Vec3<T>& operator *= (const Vec3<T> &v) { x *= v.x, y *= v.y, z *= v.z; return *this; }
__device__ Vec3<T> operator - () const { return Vec3<T>(-x, -y, -z); }
__device__ T length2() const { return x * x + y * y + z * z; }
__device__ T length() const { return sqrt(length2()); }
};
typedef Vec3<float> Vec3f;
typedef Vec3<bool> Vec3b;
class Sphere
{
public:
const char* id;
Vec3f center; /// position of the sphere
float radius, radius2; /// sphere radius and radius^2
Vec3f surfaceColor, emissionColor; /// surface color and emission (light)
float transparency, reflection; /// surface transparency and reflectivity
int animation_frame;
Vec3b animation_position_rand;
Vec3f animation_position;
Sphere(
const char* id,
const Vec3f &c,
const float &r,
const Vec3f &sc,
const float &refl = 0,
const float &transp = 0,
const Vec3f &ec = 0) :
id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc),
emissionColor(ec), transparency(transp), reflection(refl)
{
animation_frame = 0;
}
//[comment]
// Compute a ray-sphere intersection using the geometric solution
//[/comment]
__device__ bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const
{
Vec3f l = center - rayorig;
float tca = l.dot(raydir);
if (tca < 0) return false;
float d2 = l.dot(l) - tca * tca;
if (d2 > radius2) return false;
float thc = sqrt(radius2 - d2);
t0 = tca - thc;
t1 = tca + thc;
return true;
}
};
__device__ float mix(const float &a, const float &b, const float &mixval)
{
return b * mixval + a * (1 - mixval);
}
__device__ Vec3f trace(
const Vec3f &rayorig,
const Vec3f &raydir,
const Sphere *spheres,
const unsigned int spheres_size,
const int &depth)
{
float tnear = INFINITY;
const Sphere* sphere = NULL;
// find intersection of this ray with the sphere in the scene
for (unsigned i = 0; i < spheres_size; ++i) {
float t0 = INFINITY, t1 = INFINITY;
if (spheres[i].intersect(rayorig, raydir, t0, t1)) {
if (t0 < 0) t0 = t1;
if (t0 < tnear) {
tnear = t0;
sphere = &spheres[i];
}
}
}
// if there's no intersection return black or background color
if (!sphere) return Vec3f(2);
Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray
Vec3f phit = rayorig + raydir * tnear; // point of intersection
Vec3f nhit = phit - sphere->center; // normal at the intersection point
nhit.normalize(); // normalize normal direction
// If the normal and the view direction are not opposite to each other
// reverse the normal direction. That also means we are inside the sphere so set
// the inside bool to true. Finally reverse the sign of IdotN which we want
// positive.
float bias = 1e-4; // add some bias to the point from which we will be tracing
bool inside = false;
if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true;
if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) {
float facingratio = -raydir.dot(nhit);
// change the mix value to tweak the effect
float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1);
// compute reflection direction (not need to normalize because all vectors
// are already normalized)
Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit);
refldir.normalize();
Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1);
Vec3f refraction = 0;
// if the sphere is also transparent compute refraction ray (transmission)
if (sphere->transparency) {
float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface?
float cosi = -nhit.dot(raydir);
float k = 1 - eta * eta * (1 - cosi * cosi);
Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k));
refrdir.normalize();
refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1);
}
// the result is a mix of reflection and refraction (if the sphere is transparent)
surfaceColor = (
reflection * fresneleffect +
refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor;
}
else {
// it's a diffuse object, no need to raytrace any further
for (unsigned i = 0; i < spheres_size; ++i) {
if (spheres[i].emissionColor.x > 0) {
// this is a light
Vec3f transmission = 1;
Vec3f lightDirection = spheres[i].center - phit;
lightDirection.normalize();
for (unsigned j = 0; j < spheres_size; ++j) {
if (i != j) {
float t0, t1;
if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) {
transmission = 0;
break;
}
}
}
surfaceColor += sphere->surfaceColor * transmission *
max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor;
}
}
}
return surfaceColor + sphere->emissionColor;
}
extern "C" __global__
void raytrace_kernel(unsigned int width, unsigned int height, Vec3f *image, Sphere *spheres, unsigned int spheres_size, float invWidth, float invHeight, float aspectratio, float angle) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < height && x < width) {
float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio;
float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle;
Vec3f raydir(xx, yy, -1);
raydir.normalize();
image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0);
}
}
I can successfully compile it with: nvcc --ptx kernel.cu -o kernel.ptx (full PTX here) and use that PTX in the driver API with cuModuleLoadDataEx using the following snippet. It works as expected.
It works fine even if I uncomment the #include <math.h> line (actually, the PTX generated is exactly the same).
CudaSafeCall( cuInit(0) );
CUdevice device;
CudaSafeCall( cuDeviceGet(&device, 0) );
CUcontext context;
CudaSafeCall( cuCtxCreate(&context, 0, device) );
unsigned int error_buffer_size = 1024;
std::vector<CUjit_option> options;
std::vector<void*> values;
char* error_log = new char[error_buffer_size];
options.push_back(CU_JIT_ERROR_LOG_BUFFER); //Pointer to a buffer in which to print any log messages that reflect errors
values.push_back(error_log);
options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
values.push_back(&error_buffer_size);
options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); //Determines the target based on the current attached context (default)
values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT
CUmodule module;
CUresult status = cuModuleLoadDataEx(&module, ptxSource, options.size(), options.data(), values.data());
if (error_log && error_log[0]) { //https://stackoverflow.com/a/7970669/3136474
std::cout << "Compiler error: " << error_log << std::endl;
}
CudaSafeCall( status );
However, whenever I try to compile this exact kernel using NVRTC (full PTX here), it compiles successfully but gives me a Segmentation fault (core dumped) on the call to cuModuleLoadDataEx (when trying to use the resulting PTX).
If I uncomment the #include <math.h> line, it fails at the nvrtcCompileProgram call with the following output:
nvrtcSafeBuild() failed at cuda_raytracer_nvrtc_api.cpp:221 : NVRTC_ERROR_COMPILATION
Build log:
/usr/include/bits/mathcalls.h(177): error: linkage specification is incompatible with previous "isinf"
__nv_nvrtc_builtin_header.h(126689): here
/usr/include/bits/mathcalls.h(211): error: linkage specification is incompatible with previous "isnan"
__nv_nvrtc_builtin_header.h(126686): here
2 errors detected in the compilation of "kernel.cu".
The code I'm using to compile it with NVRTC is:
nvrtcProgram prog;
NvrtcSafeCall( nvrtcCreateProgram(&prog, kernelSource, "kernel.cu", 0, NULL, NULL) );
// https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
std::vector<const char*> compilationOpts;
compilationOpts.push_back("--device-as-default-execution-space");
// NvrtcSafeBuild is a macro which automatically prints nvrtcGetProgramLog if the compilation fails
NvrtcSafeBuild( nvrtcCompileProgram(prog, compilationOpts.size(), compilationOpts.data()), prog );
size_t ptxSize;
NvrtcSafeCall( nvrtcGetPTXSize(prog, &ptxSize) );
char* ptxSource = new char[ptxSize];
NvrtcSafeCall( nvrtcGetPTX(prog, ptxSource) );
NvrtcSafeCall( nvrtcDestroyProgram(&prog) );
Then I simply load the ptxSource using the previous snippet (note: that code block is the same used for both the driver API version and the NVRTC version).
Additional things that I've noticed/tried so far
The PTX generated by the NVCC and the one generated by NVRTC are quite different, but I'm unable to understand them to identify possible problems.
Tried to specify the specific GPU architecture (in my case, CC 6.1) to the compiler, no difference.
Tried to disable any compiler optimizations (options --ftz=false --prec-sqrt=true --prec-div=true --fmad=false in nvrtcCompileProgram). PTX file got bigger, but still Segfaulting.
Tried to add --std=c++11 or --std=c++14 to the NVRTC compiler options. With any of them NVRTC generates an almost empty (4 lines) PTX but issue no warning nor error until I try to use it.
Environment
SO: Ubuntu 18.04.4 LTS 64-bit
nvcc --version: Cuda compilation tools, release 10.1, V10.1.168. Built on Wed_Apr_24_19:10:27_PDT_2019
gcc --version: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Hardware: Intel I7-7700HQ, GeForce GTX 1050 Ti
Edit on OP+1 day
I forgot to add my environment. See previous section.
Also can you compile the nvrtc output with ptxas? – #talonmies' comment
The nvcc-generated PTX compiles with a warning:
$ ptxas -o /tmp/temp_ptxas_output.o kernel.ptx
ptxas warning : Stack size for entry function 'raytrace_kernel' cannot be statically determined
Which is due to the recursive kernel function (more on that).
It can be safely ignored.
The nvrtc-generated PTX does not compile and issues the error:
$ ptxas -o /tmp/temp_ptxas_output.o nvrtc_kernel.ptx
ptxas fatal : Unresolved extern function '_Z5powiffi'
Based on this question I added __device__ to Sphere class constructor and removed --device-as-default-execution-space compiler option.
It generates a slightly different PTX now, but still presents the same error.
Compiling with the #include <math.h> now generates a lot of "A function without execution space annotations is considered a host function, and host functions are not allowed in JIT mode." warnings besides the previous errors.
If I try to use the accepted solution of the question it throws me a bunch of syntax errors and does not compile. NVCC still works flawlessly.
Just found the culprit by the ancient comment-and-test method: the error goes away if I remove the pow call used to calculate the fresnel effect inside the trace method.
For now, I've just replaced pow(var, 3) for var*var*var.
I created a MVCE and filled a bug report to NVIDIA: https://developer.nvidia.com/nvidia_bug/2917596.
Which Liam Zhang answered and pointed me the problem:
The issue in your code is that there is an incorrect option value being passed to cuModuleLoadDataEx. In lines:
options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
values.push_back(&error_buffer_size);
the buffer size option is provided, but instead of passing a value with the size, a pointer to that value is passed. Since this pointer is then read as a number, the driver assumed a much larger buffer size than 1024.
During the NVRTC compilation a "Unresolved extern function" error occurred, because the pow function signature, as you can find in the documentation is:
__device__​ double pow ( double x, double y )
When the driver tried to zero the buffer when putting the error message in it, the segfault happened.
Without the call to pow, there was no compilation error, so the error buffer was not used and there was no segfault.
To ensure the device code is correct, the values used to call pow function as well as the output pointer should be a double number, or a float equivalent function, powf, could be used.
If I change the call to values.push_back((void*)error_buffer_size); it reports the same error as ptxas compilation of the generated PTX:
Compiler error: ptxas fatal : Unresolved extern function '_Z5powiffi'
cudaSafeCall() failed at file.cpp:74 : CUDA_ERROR_INVALID_PTX - a PTX JIT compilation failed

cuda:multiple threads access the same global variable

#define dimG 16
#define dimB 64
// slovebyGPU
__global__ void SloveStepGPU(float* X, float* Y, int * iCons, int* jCons, int * dCons, float* wCons, int cnt, float c)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = id; i<cnt; i += dimG*dimB) {
int I = iCons[i];
int J = jCons[i];
int d = dCons[i];
float wc = 1.0f*wCons[i]*c;
if (wc > 1.0)wc = 1.0;
float XI = atomicAdd(&(X[I]), 0);
float XJ = atomicAdd(&(X[J]), 0);
float YI = atomicAdd(&(Y[I]), 0);
float YJ = atomicAdd(&(Y[J]), 0);
float pqx = XI - XJ;
float pqy = YI - YJ;
float mag = sqrtf(pqx*pqx + pqy*pqy);
float r = 1.0f*(d - mag) / 2;
float mx = wc * r * pqx / (mag + eps);
float my = wc * r * pqy / (mag + eps);
if (d == 1) {
atomicAdd(&(X[I]), mx);
atomicAdd(&(Y[I]), my);
}
atomicAdd(&(X[J]), -mx);
atomicAdd(&(Y[J]), -my);
}
In this code, I know that X, Y may have data races. My previous thought was: Allowed reading of XI, XJ, YI, YJ may not be the latest data. However, I found that in the process of data race, it may cause XI, XJ, YI, YJ to read random memory values. That is, a memory access violation. Even if I add a lock during reading and writing, I still get the same result. Only when I reduce the size of dimB and dimG so that there is almost no data race, can I get the correct result. Is there any solution?
I use 64-bit compilation under windows + vs2015 + cuda9.1 environment.
However, I used the same code under linux and found no problems.
There is no problem when using nsight cuda debugger under windows. The reason is probably that running with debugger is slow and does not cause data race.
-------update line-----
delete other code
The problem appeared in this if (d == 1), I replaced the if with the device function fminf,fmaxf and so on to solve the problem. I am guessing that the branch was entered in the same warp, and there was data competition and some processes were suspended, which caused strange problems.
if (d == 1) {
atomicAdd(&(X[I]), mx);
atomicAdd(&(Y[I]), my);
}
to
float fd = fmaxf(2.0f - d, 0.0f);
X[I] += fd * 1.0f * mx;
Y[I] += fd * 1.0f * my;

CUDA Ray-Sphere intersection random walk spooky values

Results
The above results are the |X|Y|Z|AbsDistance of each sphere intersection, random spooky values appear probably because of a newbie mistake, but I really can't get it.
To be as specific as I can:
The following snippet is supposed to calculate the intersection point between a ray and a spherical boundary with a predefined radius and the origin as the center.
To give more context:
1- The RandomWalk starts from the origin and moves with a randomly generated _step and _direction.
2- After each step, the ray is checked for hitting possibility by comparing the absolute distance to the radius of the boundary.
3- getIntersectionPoint() returns the point of intersection, but as the number of points or number of steps increases, the probability of outcasts increases, messing up the whole thing.
Here's what I've done:
#include <curand.h>
#include <curand_kernel.h>
#include <iostream>
#define N 256 // Number of photons
#define THREADS_PER_BLOCK 256 // Threads per Block
#define BOUNDARY_RADIUS 5.0
class Point{
private:
float _x;
float _y;
float _z;
public:
__device__ __host__ Point(float x, float y, float z){
setCoordinates(x, y, z);
}
__device__ __host__ Point(){
setCoordinates(0.f, 0.f, 0.f);
}
__device__ __host__
void setCoordinates(float x, float y, float z)
{
this->_x = x;
this->_y = y;
this->_z = z;
}
__device__ __host__ float getX() const { return this->_x; }
__device__ __host__ float getY() const { return this->_y; }
__device__ __host__ float getZ() const { return this->_z; }
__device__ __host__
Point add(Point point){
float result_x = this->_x + point.getX();
float result_y = this->_y + point.getY();
float result_z = this->_z + point.getZ();
return Point( result_x, result_y, result_z );
}
__device__ __host__
Point subtract(Point point){
float result_x = this->_x - point.getX();
float result_y = this->_y - point.getY();
float result_z = this->_z - point.getZ();
return Point( result_x, result_y, result_z );
}
};
class RNG{
private:
__device__ float generate( curandState* globalState, int i)
{
curandState localState = globalState[i];
float random = curand_uniform( &localState );
globalState[i] = localState;
return random;
}
public:
__device__ float getRandomStep( curandState* globalState , int i) {
float step = 0.f; // Intialize for step value
step = generate (globalState, i);
return step;
}
__device__ Point getRandomPoint( curandState* globalState , int i)
{
float u = generate (globalState , i);
float v = generate (globalState, i);
float theta = 2 * M_PI * u;
float phi = acos(1 - 2 * v);
// Transforming into the cartesian space
float x = sin(phi) * cos(theta);
float y = sin(phi) * sin(theta);
float z = cos(phi);
return Point(x,y,z);
}
};
class Ray{
private:
Point _prevPos;
Point _currentPos;
Point _direction;
float _step;
public:
__device__ Ray(Point startingPoint, Point direction){
this->_currentPos.setCoordinates(startingPoint.getX(), startingPoint.getY(), startingPoint.getZ());
this->_direction.setCoordinates(direction.getX(), direction.getY(), direction.getZ());
}
__device__ void setDirection(Point direction) { this->_direction.setCoordinates(direction.getX(), direction.getY(), direction.getZ()); }
__device__ void setStep(float step) { this->_step = step; }
__device__ Point getCurrentPos() const { return this->_currentPos; }
__device__ Point getDirection() const { return this->_direction; }
__device__ Point getPrevPos() const { return this->_prevPos; }
__device__ float getStep() const { return this->_step; }
__device__ void move(Point direction, float step) // The point moves in the specified direction with the given step
{
this->_prevPos = this->_currentPos;
this->_direction = direction;
this->_step = step;
float newX = this->_currentPos.getX() + (direction.getX() * step);
float newY = this->_currentPos.getY() + (direction.getY() * step);
float newZ = this->_currentPos.getZ() + (direction.getZ() * step);
this->_currentPos.setCoordinates(newX, newY, newZ);
}
};
class Boundary{
private:
float _radius;
Point _center;
__device__
float dot(Point point1, Point point2){return point1.getX()*point2.getX() + point1.getY()*point2.getY() + point1.getZ()*point2.getZ();}
public:
__device__ __host__ Boundary(float r, Point c){
_radius = r;
_center = c;
}
__device__ bool isCrossed(Ray ray){
float absDistance = (float) sqrtf((float) powf(ray.getCurrentPos().getX(),2)
+ (float) powf(ray.getCurrentPos().getY(),2)
+ (float) powf(ray.getCurrentPos().getZ(),2));
if(absDistance >= _radius){
return true;
} else {
return false;
}
};
__device__ Point getIntersectionPoint(Ray ray){
Point A = ray.getPrevPos();
Point B = ray.getDirection();
Point S = A.add(_center);
Point A_C = A.subtract(_center);
float a = dot(B, B);
float b = 2.0 * dot(B, A_C);
float c = dot(A_C, A_C) - _radius*_radius;
float discriminant = b*b - 4*a*c;
float t1 = (-b + sqrtf(discriminant)) / (2.0*a);
float t2 = (-b - sqrtf(discriminant)) / (2.0*a);
float t;
if(t1 < 0){
t = t2;
} else {
t = t1;
}
return Point((A.getX()+B.getX()*t),(A.getY()+B.getY()*t),(A.getZ()+B.getZ()*t));
}
};
/**
* #brief randomWalk
* keeps wandering around with the photon in the 3D space
* #return The Point where the Photon hits the Boundary
*/
__device__ Point randomWalk(curandState_t *states, int idx, Boundary boundary, RNG rng)
{
Ray ray = Ray(Point(0.f, 0.f, 0.f), Point(0.f, 0.f, 0.f));
while (!boundary.isCrossed(ray))
{
ray.move(rng.getRandomPoint(states, idx), rng.getRandomStep(states, idx));
}
return boundary.getIntersectionPoint(ray);
}
void streamOut(Point* _cpuPoints);
__global__ void finalPosition(unsigned int seed, curandState_t* states, Point* _gpuPoints,Boundary boundary,RNG rng) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
curand_init(seed, idx, 0, &states[idx]);
Point finalPos;
finalPos = randomWalk(states, idx, boundary, rng);
_gpuPoints[idx] = finalPos;
}
int main() {
int nBlocks = N/THREADS_PER_BLOCK + 1;
curandState_t* states;
cudaMalloc((void**) &states, N * sizeof(curandState_t));
// Allocate host memory for final positions
Point * _cpuPoints= (Point*)malloc(sizeof(Point) * N);
// Allocate device memory for final positions
Point* _gpuPoints = nullptr;
cudaMalloc((void**) &_gpuPoints, N * sizeof(Point));
// Initializing the Boundary and the Random Number Generator
Boundary boundary = Boundary(BOUNDARY_RADIUS, Point(0.f, 0.f, 0.f));
RNG rng;
// Call Kernel
finalPosition<<<nBlocks,THREADS_PER_BLOCK>>>(time(0), states , _gpuPoints, boundary, rng);
// Copy device data to host memory to stream them out
cudaMemcpy(_cpuPoints, _gpuPoints, N* sizeof( Point), cudaMemcpyDeviceToHost);
streamOut (&_cpuPoints[0]);
free(_cpuPoints);
cudaFree(_gpuPoints);
return 0;
}
void streamOut(Point* _cpuPoints)
{
FILE *output;
output = fopen("output.csv", "w");
for (int i = 0; i < N; i++)
{
// Streaming out my output in a log file
float absDistance = (float) sqrtf((float) powf(_cpuPoints[i].getX(), 2)
+ (float) powf(_cpuPoints[i].getY(), 2)
+ (float) powf(_cpuPoints[i].getZ(), 2));
fprintf(output, "%f,%f,%f,%f\n", _cpuPoints[i].getX(), _cpuPoints[i].getY(), _cpuPoints[i].getZ(), absDistance);
}
}
Any time you are having trouble with a CUDA code, I recommend using proper CUDA error checking and run your code with cuda-memcheck. When I run your code with cuda-memcheck, I get a variety of errors. This means your kernel code is making illegal, out-of-bounds accesses. You can start to track this down using the method described here.
One problem in your code is that you are launching more blocks/threads than what your allocation size N dictates:
int nBlocks = N/THREADS_PER_BLOCK + 1;
This means some of the threads in your kernel launch will make out-of-bounds accesses. You need to address this with a thread check (if statement) in your kernel code.
When I take your code as posted, and modify the kernel like this:
__global__ void finalPosition(unsigned int seed, curandState_t* states, Point* _gpuPoints,Boundary boundary,RNG rng, int n) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n){
curand_init(seed, idx, 0, &states[idx]);
Point finalPos;
finalPos = randomWalk(states, idx, boundary, rng);
_gpuPoints[idx] = finalPos;}
}
and the kernel launch like this:
finalPosition<<<nBlocks,THREADS_PER_BLOCK>>>(time(0), states , _gpuPoints, boundary, rng, N);
I get this result (output.csv):
$ cat output.csv
0.628292,-4.899494,0.774730,5.000000
0.162323,-4.930647,-0.813861,5.000000
-1.715985,-0.534316,-4.665823,5.000000
-2.411644,-3.632435,-2.447323,5.000000
-3.418264,-0.851781,3.548231,5.000000
-2.850476,-2.937130,-2.871943,5.000000
0.072410,3.170733,-3.865386,5.000000
1.959057,-0.443189,-4.578829,5.000000
2.031133,-3.467616,-2.974919,5.000000
-2.107327,3.904619,2.305021,5.000000
-4.639953,1.667007,-0.831821,5.000000
3.720370,1.624804,2.918708,5.000000
-1.534095,-3.247724,3.478339,5.000000
-3.888582,0.315719,-3.127179,5.000000
-0.054493,-4.998784,-0.095864,5.000000
3.298623,3.518482,1.318854,5.000000
4.641367,-1.859068,-0.039635,5.000000
-3.671611,-0.072624,3.393228,5.000000
-1.256829,-0.310876,4.829465,5.000000
-2.492307,-4.182354,1.138562,5.000000
1.395312,-2.987793,3.758483,5.000000
-2.762215,-3.152503,2.726151,5.000000
3.101520,-1.983825,-3.383048,5.000000
-2.169484,3.941614,2.181059,5.000000
-3.971401,-1.138357,-2.816402,5.000000
-2.118435,-1.203381,4.366246,5.000000
3.319744,-3.698802,-0.546043,5.000000
2.737933,3.012805,-2.902883,5.000000
-2.870568,-0.945093,3.983295,5.000000
3.576528,-2.390892,2.547957,5.000000
4.602388,0.700673,1.824028,5.000000
0.122336,4.979045,-0.440617,5.000000
-0.935764,-1.534525,4.665789,5.000000
3.667711,-3.357755,0.522854,5.000000
-1.289282,1.290344,4.655402,5.000000
-3.764930,-3.280344,-0.254253,5.000000
4.267314,-0.811147,2.476302,5.000000
-3.693138,3.297244,-0.699224,5.000000
-1.038960,-2.293650,-4.319691,5.000000
-4.245689,0.974306,2.454558,5.000000
-3.710622,2.254789,2.479358,5.000000
-0.739412,-2.375453,-4.337107,5.000000
-1.122346,0.997810,4.769142,5.000000
4.641891,-1.289307,-1.338109,5.000000
3.943014,-2.680164,-1.506439,5.000000
-1.657783,0.458186,-4.694871,5.000000
2.903168,-3.962222,0.934030,5.000000
1.922109,4.382765,-1.448057,5.000000
2.943883,4.041326,0.035208,5.000000
3.264783,1.974566,-3.231452,5.000000
-3.273946,-2.057536,-3.169830,5.000000
0.055952,-3.367576,3.695442,5.000000
-0.072741,-4.568989,-2.029546,5.000000
-0.157276,4.870314,-1.120405,5.000000
1.299422,0.099700,4.827169,5.000000
2.791323,2.083337,3.587231,5.000000
-0.769589,2.674135,-4.154123,5.000000
-0.424974,2.674058,4.203428,5.000000
-1.297806,1.828922,4.468864,5.000000
1.356144,3.977489,-2.709327,5.000000
-4.020390,1.910192,2.277636,5.000000
0.859541,-4.891906,-0.574843,5.000000
0.760309,4.836938,-1.012894,5.000000
-4.918316,0.898741,-0.049279,5.000000
2.159176,-0.357519,4.495569,5.000000
1.337239,3.632694,-3.164700,5.000000
1.287019,3.640088,3.177001,5.000000
4.175551,-2.552966,1.023299,5.000000
-4.189130,-2.710545,0.322699,5.000000
-3.775866,0.422600,3.250269,5.000000
1.227863,1.939098,-4.442100,5.000000
-0.910808,-0.769251,-4.855789,5.000000
-2.836509,-4.018154,0.899253,5.000000
-0.943431,-4.248322,-2.462051,5.000000
4.839777,-0.542668,1.132283,5.000000
-0.543598,-4.860043,1.041386,5.000000
2.096293,0.731096,4.480072,5.000000
1.515222,-4.503112,1.557589,5.000000
0.391035,-2.820461,-4.109999,5.000000
-4.697918,1.659874,-0.417592,5.000000
0.731389,4.766176,1.322361,5.000000
-4.971092,0.391872,0.366991,5.000000
4.683945,1.503105,-0.895171,5.000000
0.094646,0.803327,4.934137,5.000000
-4.756599,-1.063119,1.115591,5.000000
-4.741367,0.601832,1.468751,5.000000
-0.622062,-4.399431,-2.293043,5.000000
3.998584,-2.430138,-1.762316,5.000000
2.889354,3.753414,-1.601098,5.000000
4.619578,-1.843518,0.510819,5.000000
-3.468601,-3.576452,-0.421662,5.000000
-2.446475,-3.452250,-2.663969,5.000000
0.611008,4.935348,-0.518658,5.000000
3.356182,3.689818,-0.348258,5.000000
2.723260,-4.022014,-1.186279,5.000000
2.515270,-3.615386,-2.366938,5.000000
1.461690,-4.704300,0.856170,5.000000
-1.220645,3.493145,3.362731,5.000000
-3.669620,3.239435,-1.019782,5.000000
-3.316329,1.182409,3.550193,5.000000
4.916836,-0.796389,0.436450,5.000000
-0.622584,-1.670654,4.671328,5.000000
-1.539724,3.515036,-3.205272,5.000000
-2.272659,3.932659,-2.090267,5.000000
1.659590,0.629000,-4.674411,5.000000
2.067366,-4.000756,2.172545,5.000000
-3.875296,1.900115,-2.524211,5.000000
3.605831,-3.310765,1.018244,5.000000
-0.772092,-4.551371,-1.920650,5.000000
2.968601,-4.023230,-0.032172,5.000000
-1.503622,-3.879141,2.773334,5.000000
-1.722315,-1.940946,-4.273916,5.000000
1.193075,3.128174,3.713637,5.000000
4.582112,1.741668,-0.985310,5.000000
-1.585273,-3.350112,-3.356137,5.000000
3.985136,-2.446971,-1.769469,5.000000
3.462019,2.040801,-2.974820,5.000000
2.336477,1.321345,4.218403,5.000000
-1.968305,4.097646,-2.082083,5.000000
3.373862,1.969776,3.120422,5.000000
-4.997004,0.112979,-0.131096,5.000000
-3.184446,1.498715,-3.551501,5.000000
-4.962571,0.586419,0.170300,5.000000
-2.533729,3.452926,-2.580216,5.000000
-0.292847,4.670508,1.760851,5.000000
4.836363,1.059029,0.698605,5.000000
2.820885,-4.074259,-0.665598,5.000000
-2.115496,4.106643,1.913154,5.000000
1.624954,3.679764,2.969656,5.000000
4.967940,-0.505954,-0.252150,5.000000
-4.672419,-1.567572,-0.843337,5.000000
3.070334,2.869947,-2.708589,5.000000
2.897243,3.452626,2.164568,5.000000
-3.926629,-1.834329,-2.493356,5.000000
1.167627,-3.817905,3.010024,5.000000
3.711214,2.119984,2.594717,5.000000
2.891797,0.411721,-4.058078,5.000000
-4.938633,-0.489490,0.608526,5.000000
2.090108,3.137994,3.283968,5.000000
-4.941360,0.246557,-0.722615,5.000000
3.025169,3.877938,-0.899971,5.000000
-0.057637,-1.374093,4.807135,5.000000
0.834437,4.757398,-1.292628,5.000000
-2.652762,1.304395,4.032544,5.000000
-2.801193,2.044116,3.602070,5.000000
3.026658,-3.871065,0.924228,5.000000
4.370097,2.405023,0.343677,5.000000
2.026850,-2.908674,-3.525833,5.000000
0.569686,-4.203772,-2.646461,5.000000
-4.491343,-1.708262,1.381909,5.000000
-2.891552,-2.594137,-3.147916,5.000000
4.539808,1.979339,-0.687284,5.000000
3.895631,-2.867161,1.266274,5.000000
-4.846499,-0.577102,-1.085540,5.000000
3.875701,-1.328161,-2.866169,5.000000
-3.538635,-1.043489,3.374788,5.000000
-0.142181,-4.997194,0.088522,5.000000
-1.737878,2.866842,-3.709582,5.000000
2.625108,-2.538683,3.415245,5.000000
-1.590130,-2.351951,4.115800,5.000000
-2.037973,-3.422420,3.022202,5.000000
1.821700,-3.040913,3.526224,5.000000
0.371202,4.985830,-0.060914,5.000000
4.683237,-0.619066,-1.638306,5.000000
2.398519,-4.361123,-0.477192,5.000000
3.776791,1.066108,3.098268,5.000000
-1.047291,2.938777,-3.907271,5.000000
-3.603310,1.989184,2.838891,5.000000
1.356899,3.938558,-2.765246,5.000000
4.458138,1.741543,1.446385,5.000000
-1.534129,2.878480,3.789565,5.000000
0.197157,-1.983445,-4.585529,5.000000
1.337347,4.006208,-2.676154,5.000000
1.458471,-3.795124,2.910309,5.000000
-0.761919,4.939224,0.153434,5.000000
4.058095,2.718427,1.068650,5.000000
-4.893477,-0.106711,1.021024,5.000000
-0.265158,4.754215,1.525495,5.000000
3.515460,3.011975,1.889325,5.000000
3.462154,-0.533415,3.567767,5.000000
-1.368027,3.607505,3.180316,5.000000
-3.604319,3.357449,0.858149,5.000000
-4.472452,1.961971,1.071376,5.000000
0.718252,-1.359473,-4.757725,5.000000
2.046570,4.513433,-0.663678,5.000000
-0.944693,1.298234,4.735203,5.000000
4.330425,-0.872982,-2.342077,5.000000
-3.978647,2.122892,2.159560,5.000000
0.277452,-0.264253,4.985297,5.000000
4.141907,-1.149088,2.554252,5.000000
-0.233733,-4.282661,2.569860,5.000000
-1.016823,1.102844,-4.769676,5.000000
2.170278,0.935831,4.406145,5.000000
-0.687388,0.576073,-4.918906,5.000000
-2.245424,-0.350784,4.453653,5.000000
1.744440,-2.652875,-3.862536,5.000000
-2.825023,3.965294,-1.138282,5.000000
4.314280,0.748011,-2.414015,5.000000
3.495596,-3.334958,1.287970,5.000000
1.075361,3.787970,3.081377,5.000000
-1.785661,-2.453979,-3.973588,5.000000
2.323098,4.402976,0.465858,5.000000
0.018438,-4.230187,-2.665554,5.000000
4.348341,-1.989394,1.460906,5.000000
-1.429905,4.039118,-2.576994,5.000000
-1.250704,0.451433,-4.819953,5.000000
0.167199,-2.760357,4.165630,5.000000
-2.622747,4.252591,-0.191487,5.000000
-4.118810,1.311523,2.513029,5.000000
0.877228,-4.601123,1.749324,5.000000
2.018730,-2.815201,-3.605464,5.000000
-1.223365,-4.307477,2.224640,5.000000
-1.950934,4.297880,1.649874,5.000000
-0.131930,-0.339385,4.986723,5.000000
1.781163,4.312893,1.796221,5.000000
1.854860,-4.261953,1.842621,5.000000
4.920663,0.883414,-0.081617,5.000000
-3.450862,-3.552723,-0.685352,5.000000
0.141668,-3.981927,-3.020627,5.000000
-0.710307,1.609614,-4.680235,5.000000
-0.913173,2.112117,4.439040,5.000000
3.299812,-3.403087,1.590674,5.000000
-4.896513,0.458815,-0.902026,5.000000
-4.692033,-1.111317,-1.322802,5.000000
1.252288,-1.454753,4.616868,5.000000
-4.081772,-2.074811,-2.008556,5.000000
-1.759481,-0.117314,4.678725,5.000000
4.809183,-0.342558,-1.324544,5.000000
0.667401,2.304548,4.386756,5.000000
0.739023,-3.974324,-2.942548,5.000000
-1.563058,4.086929,-2.419476,5.000000
1.173510,2.061344,4.401560,5.000000
0.668464,4.852658,-1.002429,5.000000
-4.419933,2.054977,1.114117,5.000000
-1.110164,-4.821075,-0.724410,5.000000
0.445619,0.343362,-4.968252,5.000000
3.654497,2.173541,2.630659,5.000000
-0.369989,0.458368,-4.965179,5.000000
2.885627,0.763048,4.011348,5.000000
-0.180556,4.945003,0.717179,5.000000
0.337894,1.747535,4.672467,5.000000
-1.756681,4.395498,1.610487,5.000000
3.864833,-2.428329,-2.041148,5.000000
-3.238250,2.699637,-2.688065,5.000000
3.534358,-3.506072,0.464514,5.000000
-4.732568,1.448340,0.710711,5.000000
-2.408191,1.378415,-4.159398,5.000000
-4.525957,-1.632861,-1.359957,5.000000
3.973403,2.903845,-0.883036,5.000000
-4.613844,1.607693,1.061963,5.000000
0.118624,-3.210011,3.831678,5.000000
4.347818,2.467214,-0.096607,5.000000
-3.273662,-3.779224,0.024393,5.000000
-1.548274,-1.125134,4.619190,5.000000
2.784302,-1.157474,-3.988473,5.000000
-3.413198,-3.552125,-0.855856,5.000000
which does not appear to have "spooky" values in it. (And the cuda-memcheck errors disappear.) If there are still "spooky" values, then you're going to need to provide a clearer definition of the results you expect.

openCl path tracer creates strange noise patterns

I've made a path tracer using openCl and c++, following the basic structure in this tutorial: http://raytracey.blogspot.com/2016/11/opencl-path-tracing-tutorial-2-path.html. As far as I can tell, nothing is wrong with the path tracing algorithm itself, but I get strange stripe patterns in the image that don't match the regular noise of path tracing. striped image
There are distinct vertical stripes and more narrow horizontal ones that make the image look granular regardless of how many samples I take per pixel. Again, pixel by pixel, the path tracer seems to be working (the outlines of objects are correct even where they appear mid-stripe) as seen here: close-up.
The only difference between my code and the one in the tutorial I link is that Sam Lapere appears to be using the c++ wrapper for openCl, and I've added a couple of features like movement. There also are a few differences in how I'm handling light bounces.
I'm new to openCl. What could be causing this? It seems like it doesn't have to do with my ray tracer itself, but somehow in the way I'm implementing openCl. I'm also using an SDL texture and renderer to show the image to the screen
here is the tracer code if it helps:
kernel:
__kernel void render_kernel
(__constant struct Sphere* spheres, const int width, const int height,
const int sphere_count, __global int * output, __global float3*
pixel_buckets, __global int* counter, __constant struct Ray* camera,
__global bool* reset){
int gid = get_global_id(0);
//for movement
if (*reset){
pixel_buckets[gid] = (float3)(0,0,0);
counter[gid] = 0;
}
int xcoord = gid % width;
int ycoord = gid / width;
struct Ray camray = createCamRay(xcoord, ycoord, width, height, counter[gid], camera);
float3 final_color = trace(spheres, &camray, sphere_count, xcoord, ycoord);
counter[gid] ++;
//average colors
pixel_buckets[gid] += final_color;
output[gid] = colorInt(clampColor(pixel_buckets[gid] / counter[gid]));
}
trace:
float3 trace(__constant struct Sphere* spheres, struct Ray* camray, const int sphere_count,
unsigned int seed0, unsigned int seed1){
struct Ray ray = *camray;
struct Sphere sphere1;
sphere1.center = (float3)(0, 0, 3);
sphere1.radius = 0.7;
sphere1.color = (float3)(1,1,0);
const int bounce_count = 8;
float3 colors[20];
float3 emiss[20];
for (int bounce = 0; bounce < bounce_count; bounce ++){
int sphere_id = 0;
float hit_distance = intersectScene(spheres, &ray, &sphere_id, sphere_count);
struct Sphere hit_sphere = spheres[sphere_id];
float3 hit_point = ray.origin + (ray.direction * hit_distance);
float3 normal = normalize(hit_point - hit_sphere.center);
if (dot(normal, -ray.direction) < 0){
normal = -normal;
}
//random bounce angles
float rand_theta = get_random(seed0, seed1);
float theta = acos(sqrt(rand_theta));
float rand_phi = get_random(seed0, seed1);
float phi = 2 * PI * rand_phi;
//scales the tnb vectors
float x = sin(theta) * sin(phi);
float y = sin(theta) * cos(phi);
float n = cos(theta);
float3 hemx = normalize(cross(ray.direction, normal)) * x;
float3 hemy = normalize(cross(hemx, normal)) * y;
normal = normal * n;
float3 new_ray = normalize(hemx + hemy + normal);
ray.origin = hit_point + (normal * EPSILON);
ray.direction = new_ray;
colors[bounce] = hit_sphere.color;
emiss[bounce] = hit_sphere.emmissive;
}
colors[bounce_count] = (float3)(0,0,0);
emiss[bounce_count] = (float3)(0,0,0);
for (int i = bounce_count - 1; i >= 0; i--){
colors[i] = (colors[i] * emiss[i]) + (colors[i] * colors[i + 1]);
}
return colors[0];
}
random number generator:
float get_random(unsigned int *seed0, unsigned int *seed1) {
/* hash the seeds using bitwise AND operations and bitshifts */
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
/* use union struct to convert int to float */
union {
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; /* bitwise AND, bitwise OR */
return (res.f - 2.0f) / 2.0f;
}
thanks

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
// Map threadIdx/blockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = x - blockDim.x * gridDim.x / 2;
float oy = y - blockDim.y * gridDim.y / 2;
float r = 0.2, g = 0.2, b = 0.5;
float maxz = -INF;
for (int i = 0; i < SPHERES_COUNT; i++) {
float n, t = s[i].hit(ox, oy, &n);
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from spere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
image.permute_axes("yzcx");
image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
I cannot understand why...
I know that If remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
The error is not reported, so I thought It may be an out of index error, reported later, but I have An identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
kernel<<<grids, threads>>>(dev_bitmap, s);
......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is a invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
......
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.