OpenCL kernel only affecting portion of entire work-space - c++

I have been working on a 2D implementation of SPH as found here: http://www.cs.cornell.edu/~bindel/class/cs5220-f11/code/sph.pdf
I got it working CPU side, but found that without the ability to crank up the number of iterations, the quality of the simulation is not the best.
Hence I decided to port it to OpenCL 1.2 using the C++ bindings. My kernel is compiling, and the data is being written into and read from the buffers perfectly well.
However, due to my unfamiliarity with the GPU architechture and how index-spaces and work-groups are laid out, and the scarcity of resources that address this particular topic, I've been clawing in the dark somewhat when it comes to making sure that my kernel code is doing what I think it is doing.
The problem that I am encountering is that only one work-group of particles seem to be updated the way they should.
My guess is that I am updating acceleration wrong, but I am not certain how to accomplish it given that I need to compare every work-item to every other work item, and when I try to perform the operation through the naive method the simulation ends up exploding.
I am trying to iterate through the work-groups(blocks) and then through the individual work-items in each block, but the effect seems to be that only 1 block is ever updated.
Suggestions? Ideas? Resources?
Would welcome any input at this stage.
Thanks!
Code for the kernel is attached below.
void Density_Calculation(
float ConstantDensitySumTerm,
float ConstantDensityKernelTerm,
float eps,
float H2,
global float4* position,
global float* density,
local float4* pblock
)
{
// Id of this work-item in global index space
int gid = get_global_id(0);
// Id of this work-item within it's work group
int tid = get_local_id(0);
int globalSize = get_global_size(0);
int localSize = get_local_size(0);
int numTiles = globalSize/localSize;
// Zero out the density term of this work-item
density[gid] = 0;
density[gid] += ConstantDensitySumTerm;
float4 thisPosition = position[gid];
float densityTerm = 0.0;
// Outer loop iterates over all the work-group blocks
for(int i = 0; i < numTiles; ++i)
{
// Cache the particle position within the work-group
pblock[tid] = position[(i * localSize) + tid];
// synchronize to make sure data is available for processing
barrier(CLK_LOCAL_MEM_FENCE);
// Inner loop iterates over the work-items in each work-group, after all positions have been cached
for(int j = 0; j < localSize; ++j)
{
float4 otherPosition = pblock[j];
float4 deltaPosition = thisPosition - otherPosition;
float r2 = (deltaPosition.x * deltaPosition.x) + (deltaPosition.y * deltaPosition.y) + (deltaPosition.z * deltaPosition.z);
float z = (H2 - r2) + eps;
if(z > 0)
{
float rho_ij = ConstantDensityKernelTerm * z * z * z;
densityTerm += rho_ij;
}
}
// Synchronize so that next tile can be loaded
barrier(CLK_LOCAL_MEM_FENCE);
}
density[gid] += densityTerm;
}
void Acceleration_Calculation(
float eps,
float ConstantDensitySumTerm, float ConstantDensityKernelTerm,
float H2, float ReferenceDensity, float InteractionRadius,
float C0, float CP, float CV,
global float4* position,
global float4* velocity_full,
global float4* acceleration,
global float* density,
local float4* pblock
)
{
Density_Calculation(ConstantDensitySumTerm, ConstantDensityKernelTerm, eps, H2, position, density, pblock);
// Id of this work-item in global index space
int gid = get_global_id(0);
// Id of this work-item within it's work group
int tid = get_local_id(0);
int globalSize = get_global_size(0);
int localSize = get_local_size(0);
int numTiles = globalSize/localSize;
// Set acceleration parameters
//acceleration[gid].x = 0.0;
acceleration[gid].y = -0.01;
float4 thisPosition = position[gid];
float4 thisVelocity = velocity_full[gid];
float rhoi = density[gid];
float accelerationTermX = 0.0;
float accelerationTermY = 0.0;
for(int i = 0; i < numTiles; ++i)
{
for(int j = 0; j < localSize; ++j)
{
float4 otherPosition = position[j];
float4 deltaPosition = thisPosition - otherPosition;
float r2 = (deltaPosition.x * deltaPosition.x) + (deltaPosition.y * deltaPosition.y) + (deltaPosition.z * deltaPosition.z);
if(r2 < (H2 + eps))
{
float rhoj = density[j];
float q = sqrt(r2) / InteractionRadius;
float u = 1 - q;
float w0 = C0 * (u / rhoi / rhoj);
float wP = w0 * CP * (rhoi + rhoj - (2 * ReferenceDensity)) * (u / q);
float wV = w0 * CV;
float4 deltaVelocity = thisVelocity - velocity_full[j];
accelerationTermX += (wP * deltaPosition.x) + (wV * deltaVelocity.x);
accelerationTermY += (wP * deltaPosition.y) + (wV * deltaVelocity.y);
}
}
}
acceleration[gid].x += accelerationTermX;
acceleration[gid].y += accelerationTermY;
}
void LeapfrogIntegrator(
const float4 dt,
global float4* position,
global float4* velocity_full,
global float4* velocity_half,
global float4* acceleration
)
{
// Id of this work-item in global index space
int gid = get_global_id(0);
velocity_half[gid] = velocity_full[gid] + (acceleration[gid] * (dt/2));
velocity_full[gid] += acceleration[gid] * dt;
position[gid] += velocity_full[gid] * dt;
}
void kernel SPH_kernel(
float dt, float eps,
float ConstantDensitySumTerm, float ConstantDensityKernelTerm,
float H2, float ReferenceDensity, float InteractionRadius,
float C0, float CP, float CV,
global float4* position,
global float4* velocity_half,
global float4* velocity_full,
global float4* acceleration,
global float* density,
local float4* pblock
)
{
const float4 dt4 = (float4)(dt,dt,dt,0.0f);
Acceleration_Calculation(eps, ConstantDensitySumTerm, ConstantDensityKernelTerm, H2, ReferenceDensity, InteractionRadius, C0, CP, CV,
position, velocity_full, acceleration, density, pblock);
LeapfrogIntegrator(dt4, position, velocity_full, velocity_half, acceleration);
}

Related

Why isn't my 4 thread implementation faster than the single thread one?

I don't know much about multi-threading and I have no idea why this is happening so I'll just get to the point.
I'm processing an image and divide the image in 4 parts and pass each part to each thread(essentially I pass the indices of the first and last pixel rows of each part). For example, if the image has 1000 rows, each thread will process 250 of them. I can go in details about my implementation and what I'm trying to achieve in case it can help you. For now I provide the code executed by the threads in case you can detect why this is happening. I don't know if it's relevant but in both cases(1 thread or 4 threads) the process takes around 15ms and pfUMap and pbUMap are unordered maps.
void jacobiansThread(int start, int end,vector<float> &sJT,vector<float> &sJTJ) {
uchar* rgbPointer;
float* depthPointer;
float* sdfPointer;
float* dfdxPointer; float* dfdyPointer;
float fov = radians(45.0);
float aspect = 4.0 / 3.0;
float focal = 1 / (glm::tan(fov / 2));
float fu = focal * cols / 2 / aspect;
float fv = focal * rows / 2;
float strictFu = focal / aspect;
float strictFv = focal;
vector<float> pixelJacobi(6, 0);
for (int y = start; y <end; y++) {
rgbPointer = sceneImage.ptr<uchar>(y);
depthPointer = depthBuffer.ptr<float>(y);
dfdxPointer = dfdx.ptr<float>(y);
dfdyPointer = dfdy.ptr<float>(y);
sdfPointer = sdf.ptr<float>(y);
for (int x = roiX.x; x <roiX.y; x++) {
float deltaTerm;// = deltaPointer[x];
float raw = sdfPointer[x];
if (raw > 8.0)continue;
float dirac = (1.0f / float(CV_PI)) * (1.2f / (raw * 1.44f * raw + 1.0f));
deltaTerm = dirac;
vec3 rgb(rgbPointer[x * 3], rgbPointer[x * 3+1], rgbPointer[x * 3+2]);
vec3 bin = rgbToBin(rgb, numberOfBins);
int indexOfColor = bin.x * numberOfBins * numberOfBins + bin.y * numberOfBins + bin.z;
float s3 = glfwGetTime();
float pF = pfUMap[indexOfColor];
float pB = pbUMap[indexOfColor];
float heavisideTerm;
heavisideTerm = HEAVISIDE(raw);
float denominator = (heavisideTerm * pF + (1 - heavisideTerm) * pB) + 0.000001;
float commonFirstTerm = -(pF - pB) / denominator * deltaTerm;
if (pF == pB)continue;
vec3 pixel(x, y, depthPointer[x]);
float dfdxTerm = dfdxPointer[x];
float dfdyTerm = -dfdyPointer[x];
if (pixel.z == 1) {
cv::Point c = findClosestContourPoint(cv::Point(x, y), dfdxTerm, -dfdyTerm, abs(raw));
if (c.x == -1)continue;
pixel = vec3(c.x, c.y, depthBuffer.at<float>(cv::Point(c.x, c.y)));
}
vec3 point3D = pixel;
pixelToViewFast(point3D, cols, rows, strictFu, strictFv);
float Xc = point3D.x; float Xc2 = Xc * Xc; float Yc = point3D.y; float Yc2 = Yc * Yc; float Zc = point3D.z; float Zc2 = Zc * Zc;
pixelJacobi[0] = dfdyTerm * ((fv * Yc2) / Zc2 + fv) + (dfdxTerm * fu * Xc * Yc) / Zc2;
pixelJacobi[1] = -dfdxTerm * ((fu * Xc2) / Zc2 + fu) - (dfdyTerm * fv * Xc * Yc) / Zc2;
pixelJacobi[2] = -(dfdyTerm * fv * Xc) / Zc + (dfdxTerm * fu * Yc) / Zc;
pixelJacobi[3] = -(dfdxTerm * fu) / Zc;
pixelJacobi[4] = -(dfdyTerm * fv) / Zc;
pixelJacobi[5] = (dfdyTerm * fv * Yc) / Zc2 + (dfdxTerm * fu * Xc) / Zc2;
float weightingTerm = -1.0 / log(denominator);
for (int i = 0; i < 6; i++) {
pixelJacobi[i] *= commonFirstTerm;
sJT[i] += pixelJacobi[i];
}
for (int i = 0; i < 6; i++) {
for (int j = i; j < 6; j++) {
sJTJ[i * 6 + j] += weightingTerm * pixelJacobi[i] * pixelJacobi[j];
}
}
}
}
}
This is the part where I call each thread:
vector<std::thread> myThreads;
float step = (roiY.y - roiY.x) / numberOfThreads;
vector<vector<float>> tsJT(numberOfThreads, vector<float>(6, 0));
vector<vector<float>> tsJTJ(numberOfThreads, vector<float>(36, 0));
for (int i = 0; i < numberOfThreads; i++) {
int start = roiY.x+i * step;
int end = start + step;
if (end > roiY.y)end = roiY.y;
myThreads.push_back(std::thread(&pwp3dV2::jacobiansThread, this,start,end,std::ref(tsJT[i]), std::ref(tsJTJ[i])));
}
vector<float> sJT(6, 0);
vector<float> sJTJ(36, 0);
for (int i = 0; i < numberOfThreads; i++)myThreads[i].join();
Other Notes
To measure time I used glfwGetTime() before and right after the second code snippet. The measurements vary but the average is about 15ms as I mentioned, for both implementations.
Starting a thread has significant overhead, which might not be worth the time if you have only 15 milliseconds worth of work.
The common solution is to keep threads running in the background and send them data when you need them, instead of calling the std::thread constructor to create a new thread every time you have some work to do.
Pure spectaculation but two things might be preventing the full power of parallelization.
Processing speed is limited by the memory bus. Cores will wait until data is loaded before continuing.
Data sharing between cores. Some caches are core specific. If memory is shared between cores, data must traverse down to shared cache before loading.
On Linux you can use Perf to check for cache misses.
if you wanna better time you need to split a cycle runs from a counter, for this you need to do some preprocessing. some fast stuff like make an array of structures with headers for each segment or so. if say you can't mind anything better you can just do vector<int> with values of a counter. Then do for_each(std::execution::par,...) on that. way much faster.
for timings there's
auto t2 = std::chrono::system_clock::now();
std::chrono::milliseconds f = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);

cuda:multiple threads access the same global variable

#define dimG 16
#define dimB 64
// slovebyGPU
__global__ void SloveStepGPU(float* X, float* Y, int * iCons, int* jCons, int * dCons, float* wCons, int cnt, float c)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = id; i<cnt; i += dimG*dimB) {
int I = iCons[i];
int J = jCons[i];
int d = dCons[i];
float wc = 1.0f*wCons[i]*c;
if (wc > 1.0)wc = 1.0;
float XI = atomicAdd(&(X[I]), 0);
float XJ = atomicAdd(&(X[J]), 0);
float YI = atomicAdd(&(Y[I]), 0);
float YJ = atomicAdd(&(Y[J]), 0);
float pqx = XI - XJ;
float pqy = YI - YJ;
float mag = sqrtf(pqx*pqx + pqy*pqy);
float r = 1.0f*(d - mag) / 2;
float mx = wc * r * pqx / (mag + eps);
float my = wc * r * pqy / (mag + eps);
if (d == 1) {
atomicAdd(&(X[I]), mx);
atomicAdd(&(Y[I]), my);
}
atomicAdd(&(X[J]), -mx);
atomicAdd(&(Y[J]), -my);
}
In this code, I know that X, Y may have data races. My previous thought was: Allowed reading of XI, XJ, YI, YJ may not be the latest data. However, I found that in the process of data race, it may cause XI, XJ, YI, YJ to read random memory values. That is, a memory access violation. Even if I add a lock during reading and writing, I still get the same result. Only when I reduce the size of dimB and dimG so that there is almost no data race, can I get the correct result. Is there any solution?
I use 64-bit compilation under windows + vs2015 + cuda9.1 environment.
However, I used the same code under linux and found no problems.
There is no problem when using nsight cuda debugger under windows. The reason is probably that running with debugger is slow and does not cause data race.
-------update line-----
delete other code
The problem appeared in this if (d == 1), I replaced the if with the device function fminf,fmaxf and so on to solve the problem. I am guessing that the branch was entered in the same warp, and there was data competition and some processes were suspended, which caused strange problems.
if (d == 1) {
atomicAdd(&(X[I]), mx);
atomicAdd(&(Y[I]), my);
}
to
float fd = fmaxf(2.0f - d, 0.0f);
X[I] += fd * 1.0f * mx;
Y[I] += fd * 1.0f * my;

openCl path tracer creates strange noise patterns

I've made a path tracer using openCl and c++, following the basic structure in this tutorial: http://raytracey.blogspot.com/2016/11/opencl-path-tracing-tutorial-2-path.html. As far as I can tell, nothing is wrong with the path tracing algorithm itself, but I get strange stripe patterns in the image that don't match the regular noise of path tracing. striped image
There are distinct vertical stripes and more narrow horizontal ones that make the image look granular regardless of how many samples I take per pixel. Again, pixel by pixel, the path tracer seems to be working (the outlines of objects are correct even where they appear mid-stripe) as seen here: close-up.
The only difference between my code and the one in the tutorial I link is that Sam Lapere appears to be using the c++ wrapper for openCl, and I've added a couple of features like movement. There also are a few differences in how I'm handling light bounces.
I'm new to openCl. What could be causing this? It seems like it doesn't have to do with my ray tracer itself, but somehow in the way I'm implementing openCl. I'm also using an SDL texture and renderer to show the image to the screen
here is the tracer code if it helps:
kernel:
__kernel void render_kernel
(__constant struct Sphere* spheres, const int width, const int height,
const int sphere_count, __global int * output, __global float3*
pixel_buckets, __global int* counter, __constant struct Ray* camera,
__global bool* reset){
int gid = get_global_id(0);
//for movement
if (*reset){
pixel_buckets[gid] = (float3)(0,0,0);
counter[gid] = 0;
}
int xcoord = gid % width;
int ycoord = gid / width;
struct Ray camray = createCamRay(xcoord, ycoord, width, height, counter[gid], camera);
float3 final_color = trace(spheres, &camray, sphere_count, xcoord, ycoord);
counter[gid] ++;
//average colors
pixel_buckets[gid] += final_color;
output[gid] = colorInt(clampColor(pixel_buckets[gid] / counter[gid]));
}
trace:
float3 trace(__constant struct Sphere* spheres, struct Ray* camray, const int sphere_count,
unsigned int seed0, unsigned int seed1){
struct Ray ray = *camray;
struct Sphere sphere1;
sphere1.center = (float3)(0, 0, 3);
sphere1.radius = 0.7;
sphere1.color = (float3)(1,1,0);
const int bounce_count = 8;
float3 colors[20];
float3 emiss[20];
for (int bounce = 0; bounce < bounce_count; bounce ++){
int sphere_id = 0;
float hit_distance = intersectScene(spheres, &ray, &sphere_id, sphere_count);
struct Sphere hit_sphere = spheres[sphere_id];
float3 hit_point = ray.origin + (ray.direction * hit_distance);
float3 normal = normalize(hit_point - hit_sphere.center);
if (dot(normal, -ray.direction) < 0){
normal = -normal;
}
//random bounce angles
float rand_theta = get_random(seed0, seed1);
float theta = acos(sqrt(rand_theta));
float rand_phi = get_random(seed0, seed1);
float phi = 2 * PI * rand_phi;
//scales the tnb vectors
float x = sin(theta) * sin(phi);
float y = sin(theta) * cos(phi);
float n = cos(theta);
float3 hemx = normalize(cross(ray.direction, normal)) * x;
float3 hemy = normalize(cross(hemx, normal)) * y;
normal = normal * n;
float3 new_ray = normalize(hemx + hemy + normal);
ray.origin = hit_point + (normal * EPSILON);
ray.direction = new_ray;
colors[bounce] = hit_sphere.color;
emiss[bounce] = hit_sphere.emmissive;
}
colors[bounce_count] = (float3)(0,0,0);
emiss[bounce_count] = (float3)(0,0,0);
for (int i = bounce_count - 1; i >= 0; i--){
colors[i] = (colors[i] * emiss[i]) + (colors[i] * colors[i + 1]);
}
return colors[0];
}
random number generator:
float get_random(unsigned int *seed0, unsigned int *seed1) {
/* hash the seeds using bitwise AND operations and bitshifts */
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
/* use union struct to convert int to float */
union {
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; /* bitwise AND, bitwise OR */
return (res.f - 2.0f) / 2.0f;
}
thanks

Converting 2D Noise to 3D

I've recently started experimenting with noise (simple perlin noise), and have run into a slight problem with animating it. So far come I've across an awesome looking 3d noise (https://github.com/ashima/webgl-noise) that I could use in my project but that I understood nothing of, and a bunch of tutorials that explain how to create simple 2d noise.
For the 2d noise, I originally used the following fragment shader:
uniform sampler2D al_tex;
varying vec4 varying_pos; //Actual coords
varying vec2 varying_texcoord; //Normalized coords
uniform float time;
float rand(vec2 co) { return fract(sin(dot(co, vec2(12.9898, 78.233))) * 43758.5453); }
float ease(float p) { return 3*p*p - 2*p*p*p; }
float cnoise(vec2 p, int wavelength)
{
int ix1 = (int(varying_pos.x) / wavelength) * wavelength;
int iy1 = (int(varying_pos.y) / wavelength) * wavelength;
int ix2 = (int(varying_pos.x) / wavelength) * wavelength + wavelength;
int iy2 = (int(varying_pos.y) / wavelength) * wavelength + wavelength;
float x1 = ix1 / 1280.0f;
float y1 = iy1 / 720.0f;
float x2 = ix2 / 1280.0f;
float y2 = iy2 / 720.0f;
float xOffset = (varying_pos.x - ix1) / wavelength;
float yOffset = (varying_pos.y - iy1) / wavelength;
xOffset = ease(xOffset);
yOffset = ease(yOffset);
float t1 = rand(vec2(x1, y1));
float t2 = rand(vec2(x2, y1));
float t3 = rand(vec2(x2, y2));
float t4 = rand(vec2(x1, y2));
float tt1 = mix(t1, t2, xOffset);
float tt2 = mix(t4, t3, xOffset);
return mix(tt1, tt2, yOffset);
}
void main()
{
float t = 0;
int minFreq = 0;
int noIterations = 8;
for (int i = 0; i < noIterations; i++)
t += cnoise(varying_texcoord, int(pow(2, i + minFreq))) / pow(2, noIterations - i);
gl_FragColor = vec4(vec3(t), 1);
}
The result that I got was this:
Now, I want to animate it with time. My first thought was to change the rand function to take a vec3 instead of vec2, and then change my cnoise function accordingly, to interpolate values in the z direction too. With that goal in mind, I made this:
sampler2D al_tex;
varying vec4 varying_pos;
varying vec2 varying_texcoord;
uniform float time;
float rand(vec3 co) { return fract(sin(dot(co, vec3(12.9898, 78.2332, 58.5065))) * 43758.5453); }
float ease(float p) { return 3*p*p - 2*p*p*p; }
float cnoise(vec3 pos, int wavelength)
{
ivec3 iPos1 = (ivec3(pos) / wavelength) * wavelength; //The first value that I'll sample to interpolate
ivec3 iPos2 = iPos1 + wavelength; //The second value
vec3 transPercent = (pos - iPos1) / wavelength; //Transition percent - A float in [0-1) indicating how much of each of the above values will contribute to final result
transPercent.x = ease(transPercent.x);
transPercent.y = ease(transPercent.y);
transPercent.z = ease(transPercent.z);
float t1 = rand(vec3(iPos1.x, iPos1.y, iPos1.z));
float t2 = rand(vec3(iPos2.x, iPos1.y, iPos1.z));
float t3 = rand(vec3(iPos2.x, iPos2.y, iPos1.z));
float t4 = rand(vec3(iPos1.x, iPos2.y, iPos1.z));
float t5 = rand(vec3(iPos1.x, iPos1.y, iPos2.z));
float t6 = rand(vec3(iPos2.x, iPos1.y, iPos2.z));
float t7 = rand(vec3(iPos2.x, iPos2.y, iPos2.z));
float t8 = rand(vec3(iPos1.x, iPos2.y, iPos2.z));
float tt1 = mix(t1, t2, transPercent.x);
float tt2 = mix(t4, t3, transPercent.x);
float tt3 = mix(t5, t6, transPercent.x);
float tt4 = mix(t8, t7, transPercent.x);
float tt5 = mix(tt1, tt2, transPercent.y);
float tt6 = mix(tt3, tt4, transPercent.y);
return mix(tt5, tt6, transPercent.z);
}
float fbm(vec3 p)
{
float t = 0;
int noIterations = 8;
for (int i = 0; i < noIterations; i++)
t += cnoise(p, int(pow(2, i))) / pow(2, noIterations - i);
return t;
}
void main()
{
vec3 p = vec3(varying_pos.xy, time);
float t = fbm(p);
gl_FragColor = vec4(vec3(t), 1);
}
However, on doing this, the animation feels... strange. It's as though I'm watching a slideshow of perlin noise slides, with the individual slides fading in. All other perlin noise examples that I have tried (like https://github.com/ashima/webgl-noise) are actually animated with time - you can actually see it being animated, and don't just feel like the images are fading in, and not being actually animated. I know that I could just use the webgl-noise shader, but I want to make one for myself, and for some reason, I'm failing miserably. Could anyone tell me where I am going wrong, or suggest me on how I can actually animate it properly with time?
You should proably include z in the sin function:
float rand(vec3 co) { return fract(sin(dot(co.xy ,vec2(12.9898,78.233)) + co.z) * 43758.5453); }
Apparently the somewhat random numbers are prime numbers. This is to avoid patterns in the noise. I found another prime number, 94418953, and included that in the sin/dot function. Try this:
float rand(vec3 co) { return fract(sin(dot(co.xyz ,vec3(12.9898,78.233, 9441.8953))) * 43758.5453); }
EDIT: You don't take into account wavelength on the z axis. This means that all your iterations will have the same interpolation distance. In other words, you will get the fade effect you're describing. Try calculating z the same way you calculate x and y:
int iz1 = (int(p.z) / wavelength) * wavelength;
int iz2 = (int(p.z) / wavelength) * wavelength + wavelength;
float z1 = iz1 / 720.0f;
float z2 = iz2 / 720.0f;
float zOffset = (varying_pos.z - iz1) / wavelength;
This means however that the z value will variate the same rate that y will. So if you want it to scale from 0 to 1 then you should proably multiply z with 720 before passing it into the noise function.
check this code. it's a simple version of 3d noise:
// Here are some easy to understand noise gens... the D line in cubic interpolation (rounding)
function rndng ( n: float ): float
{//random proportion -1, 1 ... many people use Sin to take
//linearity out of a pseudo random, exp n*n is faster on central processor.
var e = ( n *321.9234)%1;
return (e*e*111.07546)%2-1;
}
function lerps(o:float, v:float, alpha:float):float
{
o += ( v - o ) * alpha;
return o;
}
//3d ----------------
function lnz ( vtx: Vector3 ): float //3d perlin noise code fast
{
vtx= Vector3 ( Mathf.Abs(vtx.x) , Mathf.Abs(vtx.y) , Mathf.Abs(vtx.z) ) ;
var I = Vector3 (Mathf.Floor(vtx.x),Mathf.Floor(vtx.y),Mathf.Floor(vtx.z));
var D = Vector3(vtx.x%1,vtx.y%1,vtx.z%1);
D = Vector3(D.x*D.x*(3.0-2.0*D.x),D.y*D.y*(3.0-2.0*D.y),D.z*D.z*(3.0-2.0*D.z));
var W = I.x + I.y*71.0 + 125.0*I.z;
return lerps(
lerps( lerps(rndng(W+0.0),rndng(W+1.0),D.x) , lerps(rndng(W+71.0),rndng(W+72.0),D.x) , D.y)
,
lerps( lerps(rndng(W+125.0),rndng(W+126.0),D.x) , lerps(rndng(W+153.0),rndng(W+154.0),D.x) , D.y)
,
D.z
);
}
//1d ----------------
function lnzo ( vtx: Vector3 ): float //perlin noise, same as unityfunction version
{
var total = 0.0;
for (var i:int = 1; i < 5; i ++)
{
total+= lnz2(Vector3 (vtx.x*(i*i),0.0,vtx.z*(i*i)))/(i*i);
}
return total*5;
}
//2d 3 axis honeycombe noise ----------------
function lnzh ( vtx: Vector3 ): float // perlin noise, 2d, with 3 axes at 60'instead of 2 x y axes
{
vtx= Vector3 ( Mathf.Abs(vtx.z) , Mathf.Abs(vtx.z*.5-vtx.x*.866) , Mathf.Abs(vtx.z*.5+vtx.x*.866) ) ;
var I = Vector3 (Mathf.Floor(vtx.x),Mathf.Floor(vtx.y),Mathf.Floor(vtx.z));
var D = Vector3(vtx.x%1,vtx.y%1,vtx.z%1);
//D = Vector3(D.x*D.x*(3.0-2.0*D.x),D.y*D.y*(3.0-2.0*D.y),D.z*D.z*(3.0-2.0*D.z));
var W = I.x + I.y*71.0 + 125.0*I.z;
return lerps(
lerps( lerps(rndng(W+0.0),rndng(W+1.0),D.x) , lerps(rndng(W+71.0),rndng(W+72.0),D.x) , D.y)
,
lerps( lerps(rndng(W+125.0),rndng(W+126.0),D.x) , lerps(rndng(W+153.0),rndng(W+154.0),D.x) , D.y)
,
D.z
);
}
//2d ----------------
function lnz2 ( vtx: Vector3 ): float // i think this is 2d perlin noise
{
vtx= Vector3 ( Mathf.Abs(vtx.x) , Mathf.Abs(vtx.y) , Mathf.Abs(vtx.z) ) ;
var I = Vector3 (Mathf.Floor(vtx.x),Mathf.Floor(vtx.y),Mathf.Floor(vtx.z));
var D = Vector3(vtx.x%1,vtx.y%1,vtx.z%1);
D = Vector3(D.x*D.x*(3.0-2.0*D.x),D.y*D.y*(3.0-2.0*D.y),D.z*D.z*(3.0-2.0*D.z));
var W = I.x + I.y*71.0 + 125.0*I.z;
return lerps(
lerps( lerps(rndng(W+0.0),rndng(W+1.0),D.x) , lerps(rndng(W+71.0),rndng(W+72.0),D.x) , D.z)
,
lerps( rndng(W+125.0), rndng(W+126.0),D.x)
,
D.z
);
}

Second iteration crash - order irrelevant

To save on global memory transfers, and because all of the steps of the code work individually, I have tried to combine all of the kernals into a single kernal, with the first 2 (of 3) steps being done as device calls rather than global calls.
This is failing in the second half of the first step.
There is a function that I need to call twice, to calculate the 2 halves of an image. Regardless of the order the image is calculated in, it crashes on the second iteration.
After examining the code as well as I could, and running it multiple times with different return points, I have found what makes it crash.
__device__
void IntersectCone( float* ModDistance,
float* ModIntensity,
float3 ray,
int threadID,
modParam param )
{
bool ignore = false;
float3 normal = make_float3(0.0f,0.0f,0.0f);
float3 result = make_float3(0.0f,0.0f,0.0f);
float normDist = 0.0f;
float intensity = 0.0f;
float check = abs( Dot(param.position, Cross(param.direction,ray) ) );
if(check > param.r1 && check > param.r2)
ignore = true;
float tran = param.length / (param.r2/param.r1 - 1);
float length = tran + param.length;
float Lsq = length * length;
float cosSqr = Lsq / (Lsq + param.r2 * param.r2);
//Changes the centre position?
float3 position = param.position - tran * param.direction;
float aDd = Dot(param.direction, ray);
float3 e = position * -1.0f;
float aDe = Dot(param.direction, e);
float dDe = Dot(ray, e);
float eDe = Dot(e, e);
float c2 = aDd * aDd - cosSqr;
float c1 = aDd * aDe - cosSqr * dDe;
float c0 = aDe * aDe - cosSqr * eDe;
float discr = c1 * c1 - c0 * c2;
if(discr <= 0.0f)
ignore = true;
if(!ignore)
{
float root = sqrt(discr);
float sign;
if(c1 > 0.0f)
sign = 1.0f;
else
sign = -1.0f;
//Try opposite sign....?
float3 result = (-c1 + sign * root) * ray / c2;
e = result - position;
float dot = Dot(e, param.direction);
float3 s1 = Cross(e, param.direction);
float3 normal = Cross(e, s1);
if( (dot > tran) || (dot < length) )
{
if(Dot(normal,ray) <= 0)
{
normal = Norm(normal); //This stuff (1)
normDist = Magnitude(result);
intensity = -IntensAt1m * Dot(ray, normal) / (normDist * normDist);
}
}
}
ModDistance[threadID] = normDist; and this stuff (2)
ModIntensity[threadID] = intensity;
}
There are two things I can do to to make this not crash, both off which negate the point of the function: If I do not try to write to ModDistance[] and ModIntensity[], or if I do not write to normDist and intensity.
First chance exceptions are thrown by the code above, but not if either of the blocks commented out.
Also, The program only crashes the second time this routine is called.
Have been trying to figure this out all day, any help would be fantastic.
The code that calls it is:
int subrow = threadIdx.y + Mod_Height/2;
int threadID = subrow * (Mod_Width+1) + threadIdx.x;
int obsY = windowY + subrow;
float3 ray = CalculateRay(obsX,obsY);
if( !IntersectSphere(ModDistance, ModIntensity, ray, threadID, param) )
{
IntersectCone(ModDistance, ModIntensity, ray, threadID, param);
}
subrow = threadIdx.y;
threadID = subrow * (Mod_Width+1) + threadIdx.x;
obsY = windowY + subrow;
ray = CalculateRay(obsX,obsY);
if( !IntersectSphere(ModDistance, ModIntensity, ray, threadID, param) )
{
IntersectCone(ModDistance, ModIntensity, ray, threadID, param);
}
The kernel is running out of resources. As posted in the comments, it was giving the error CudaErrorLaunchOutOfResources.
To avoid this, you should use a __launch_bounds__ specifier to specify the block dimensions you want for your kernel. This will force the compiler to ensure there are enough resources. See the CUDA programming guide for details on __launch_bounds__.