CUDA: multiple threads access the same global variable - C++

#define dimG 16
#define dimB 64
// solve by GPU
__global__ void SloveStepGPU(float* X, float* Y, int* iCons, int* jCons, int* dCons, float* wCons, int cnt, float c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    // grid-stride loop; the stride equals the total thread count (dimG blocks * dimB threads)
    for (int i = id; i < cnt; i += dimG * dimB) {
        int I = iCons[i];
        int J = jCons[i];
        int d = dCons[i];
        float wc = 1.0f * wCons[i] * c;
        if (wc > 1.0f) wc = 1.0f;
        // atomicAdd(ptr, 0) is used here as an atomic read
        float XI = atomicAdd(&(X[I]), 0);
        float XJ = atomicAdd(&(X[J]), 0);
        float YI = atomicAdd(&(Y[I]), 0);
        float YJ = atomicAdd(&(Y[J]), 0);
        float pqx = XI - XJ;
        float pqy = YI - YJ;
        float mag = sqrtf(pqx * pqx + pqy * pqy);
        float r = 1.0f * (d - mag) / 2;
        float mx = wc * r * pqx / (mag + eps); // eps is defined elsewhere
        float my = wc * r * pqy / (mag + eps);
        if (d == 1) {
            atomicAdd(&(X[I]), mx);
            atomicAdd(&(Y[I]), my);
        }
        atomicAdd(&(X[J]), -mx);
        atomicAdd(&(Y[J]), -my);
    }
}
In this code, I know that X and Y may have data races. My previous understanding was that the reads of XI, XJ, YI, YJ were merely allowed to return stale data. However, I found that during a data race, XI, XJ, YI, YJ can end up reading what look like random memory values, that is, something resembling a memory access violation. Even if I add a lock around the reads and writes, I still get the same result. Only when I reduce dimB and dimG so that there is almost no data race do I get the correct result. Is there any solution?
I compile for 64-bit under Windows + VS2015 + CUDA 9.1.
However, the same code under Linux shows no problems.
There is also no problem when running under the Nsight CUDA debugger on Windows. The reason is probably that running with the debugger is slow and does not trigger the data race.
------- update -------
(I have deleted the other code.)

The problem appeared in this if (d == 1). I replaced the if with device functions such as fminf and fmaxf, and that solved the problem. My guess is that the branch was taken within the same warp, there was a data race, and some threads were stalled, which caused the strange results. I changed
if (d == 1) {
    atomicAdd(&(X[I]), mx);
    atomicAdd(&(Y[I]), my);
}
to
float fd = fmaxf(2.0f - d, 0.0f);
X[I] += fd * 1.0f * mx;
Y[I] += fd * 1.0f * my;
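One note on this replacement: the plain += stores drop the atomicity that the original atomicAdd calls provided, so if other threads can still update X[I] and Y[I] concurrently, the branch-free form can keep the atomics. A minimal sketch, assuming d only takes the values 1 and 2 (which the fmaxf trick implies):
// Branch-free variant that keeps the atomic updates.
// fd is 1.0f when d == 1 and 0.0f when d == 2 (assumed value range).
float fd = fmaxf(2.0f - d, 0.0f);
atomicAdd(&(X[I]), fd * mx);
atomicAdd(&(Y[I]), fd * my);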

Related

Interpreting the Visual Studio profiler: is this subtraction slow? Can I make all this any faster?

I'm using the Visual Studio profiler for the first time and I'm trying to interpret the results. Looking at the percentages on the left, I found this subtraction's time cost a bit strange (the screenshots are omitted here). Other parts of the code contain more complex expressions, yet even a simple multiplication seems way faster than the subtraction, while other multiplications take way longer, and I really don't get why.
So, I guess my question is whether there is anything weird going on here. Complex expressions take longer than that subtraction, and some expressions take way longer than other, similar ones. I ran the profiler several times and the distribution of the percentages is always like this. Am I just interpreting this wrong?
Update:
I was asked to provide the profile for the whole function, so here it is, even though it's a bit big. I ran the function inside a for loop for 1 minute and got 50k samples. The function contains a double loop. I include the code as text first for ease, followed by the pictures of the profiling (omitted here). Note that the code in the text is slightly updated.
for (int i = 0; i < NUMBER_OF_CONTOUR_POINTS; i++) {
    vec4 contourPointV(contour3DPoints[i], 1);
    float phi = angles[i];
    float xW = pose[0][0] * contourPointV.x + pose[1][0] * contourPointV.y + contourPointV.z * pose[2][0] + pose[3][0];
    float yW = pose[0][1] * contourPointV.x + pose[1][1] * contourPointV.y + contourPointV.z * pose[2][1] + pose[3][1];
    float zW = pose[0][2] * contourPointV.x + pose[1][2] * contourPointV.y + contourPointV.z * pose[2][2] + pose[3][2];
    float x = -G_FU_STRICT * xW / zW;
    float y = -G_FV_STRICT * yW / zW;
    x = (x + 1) * G_WIDTHo2;
    y = (y + 1) * G_HEIGHTo2;
    y = G_HEIGHT - y;
    phi -= extraTheta;
    if (phi < 0) phi += CV_PI2;
    int indexForTable = phi * oneKoverPI;
    //vec2 ray(cos(phi), sin(phi));
    vec2 ray(cos_pre[indexForTable], sin_pre[indexForTable]);
    vec2 ray2(-ray.x, -ray.y);
    float outerStepX = ray.x * step;
    float outerStepY = ray.y * step;
    cv::Point2f outerPoint(x + outerStepX, y + outerStepY);
    cv::Point2f innerPoint(x - outerStepX, y - outerStepY);
    cv::Point2f contourPointCV(x, y);
    cv::Point2f contourPointCVcopy(x, y);
    bool cut = false;
    if (!isInView(outerPoint.x, outerPoint.y) || !isInView(innerPoint.x, innerPoint.y)) {
        cut = true;
    }
    bool outside2 = true; bool outside1 = true;
    if (cut) {
        outside2 = myClipLine(contourPointCV.x, contourPointCV.y, outerPoint.x, outerPoint.y, G_WIDTH - 1, G_HEIGHT - 1);
        outside1 = myClipLine(contourPointCVcopy.x, contourPointCVcopy.y, innerPoint.x, innerPoint.y, G_WIDTH - 1, G_HEIGHT - 1);
    }
    myIterator innerRayMine(contourPointCVcopy, innerPoint);
    myIterator outerRayMine(contourPointCV, outerPoint);
    if (!outside1) {
        innerRayMine.end = true;
        innerRayMine.prob = true;
    }
    if (!outside2) {
        outerRayMine.end = true;
        innerRayMine.prob = true;
    }
    vec2 normal = -ray;
    float dfdxTerm = -normal.x;
    float dfdyTerm = normal.y;
    vec3 point3D = vec3(xW, yW, zW);
    cv::Point contourPoint((int)x, (int)y);
    float Xc = point3D.x; float Xc2 = Xc * Xc; float Yc = point3D.y; float Yc2 = Yc * Yc; float Zc = point3D.z; float Zc2 = Zc * Zc;
    float XcYc = Xc * Yc; float dfdxFu = dfdxTerm * G_FU; float dfdyFv = dfdyTerm * G_FU; float overZc2 = 1 / Zc2; float overZc = 1 / Zc;
    pixelJacobi[0] = (dfdyFv * (Yc2 + Zc2) + dfdxFu * XcYc) * overZc2;
    pixelJacobi[1] = (-dfdxFu * (Xc2 + Zc2) - dfdyFv * XcYc) * overZc2;
    pixelJacobi[2] = (-dfdyFv * Xc + dfdxFu * Yc) * overZc;
    pixelJacobi[3] = -dfdxFu * overZc;
    pixelJacobi[4] = -dfdyFv * overZc;
    pixelJacobi[5] = (dfdyFv * Yc + dfdxFu * Xc) * overZc2;
    float commonFirstTermsSum = 0;
    float commonFirstTermsSquaredSum = 0;
    int test = 0;
    while (!innerRayMine.end) {
        test++;
        cv::Point xy = innerRayMine.pos(); innerRayMine++;
        int x = xy.x;
        int y = xy.y;
        float dx = x - contourPoint.x;
        float dy = y - contourPoint.y;
        vec2 dxdy(dx, dy);
        float raw = -glm::dot(dxdy, normal);
        float heavisideTerm = heaviside_pre[(int)raw * 100 + 1000];
        float deltaTerm = delta_pre[(int)raw * 100 + 1000];
        const Vec3b rgb = ante[y * 640 + x];
        int red = rgb[0]; int green = rgb[1]; int blue = rgb[2];
        red = red >> 3; red = red << 10; green = green >> 3; green = green << 5; blue = blue >> 3;
        int colorIndex = red + green + blue;
        pF = pFPointer[colorIndex];
        pB = pBPointer[colorIndex];
        float denAsMul = 1 / (pF + pB + 0.000001);
        pF = pF * denAsMul;
        float pfMinusPb = 2 * pF - 1;
        float denominator = heavisideTerm * pfMinusPb + pB + 0.000001;
        float commonFirstTerm = -pfMinusPb / denominator * deltaTerm;
        commonFirstTermsSum += commonFirstTerm;
        commonFirstTermsSquaredSum += commonFirstTerm * commonFirstTerm;
    }
}
Visual Studio profiles by sampling: it interrupts execution frequently and records the value of the instruction pointer; it then maps that back to the source and calculates the frequency of hitting each line.
There are a few issues with that: in optimized code it's not always possible to figure out which source line produced a specific assembly instruction.
One trick I use is to move the code of interest into a separate function and declare it with __declspec(noinline).
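For example (a hypothetical sketch, not the asker's code), isolating the operation forces the profiler to attribute its samples to one function:
// Hypothetical helper: __declspec(noinline) keeps the optimizer from
// merging this code into the caller, so its samples stay attributable.
__declspec(noinline) float subtractOnly(float a, float b)
{
    return a - b; // samples landing here belong to this subtraction
}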
In your example, are you sure the subtraction was performed as many times as the multiplication? I would be more puzzled by the difference between the subsequent multiplications (0.39% and 0.53%).
Update:
I believe that the following lines:
float phi = angles[i];
and
phi -= extraTheta;
got moved together in assembly and the time spent getting angles[i] was added to that subtraction line.

Why isn't my 4 thread implementation faster than the single thread one?

I don't know much about multi-threading and I have no idea why this is happening, so I'll just get to the point.
I'm processing an image: I divide it into 4 parts and pass each part to a thread (essentially, I pass the indices of the first and last pixel rows of each part). For example, if the image has 1000 rows, each thread will process 250 of them. I can go into detail about my implementation and what I'm trying to achieve in case it helps. For now, I provide the code executed by the threads in case you can detect why this is happening. I don't know if it's relevant, but in both cases (1 thread or 4 threads) the process takes around 15 ms; pfUMap and pbUMap are unordered maps.
void jacobiansThread(int start, int end, vector<float>& sJT, vector<float>& sJTJ) {
    uchar* rgbPointer;
    float* depthPointer;
    float* sdfPointer;
    float* dfdxPointer; float* dfdyPointer;
    float fov = radians(45.0);
    float aspect = 4.0 / 3.0;
    float focal = 1 / (glm::tan(fov / 2));
    float fu = focal * cols / 2 / aspect;
    float fv = focal * rows / 2;
    float strictFu = focal / aspect;
    float strictFv = focal;
    vector<float> pixelJacobi(6, 0);
    for (int y = start; y < end; y++) {
        rgbPointer = sceneImage.ptr<uchar>(y);
        depthPointer = depthBuffer.ptr<float>(y);
        dfdxPointer = dfdx.ptr<float>(y);
        dfdyPointer = dfdy.ptr<float>(y);
        sdfPointer = sdf.ptr<float>(y);
        for (int x = roiX.x; x < roiX.y; x++) {
            float deltaTerm; // = deltaPointer[x];
            float raw = sdfPointer[x];
            if (raw > 8.0) continue;
            float dirac = (1.0f / float(CV_PI)) * (1.2f / (raw * 1.44f * raw + 1.0f));
            deltaTerm = dirac;
            vec3 rgb(rgbPointer[x * 3], rgbPointer[x * 3 + 1], rgbPointer[x * 3 + 2]);
            vec3 bin = rgbToBin(rgb, numberOfBins);
            int indexOfColor = bin.x * numberOfBins * numberOfBins + bin.y * numberOfBins + bin.z;
            float s3 = glfwGetTime();
            float pF = pfUMap[indexOfColor];
            float pB = pbUMap[indexOfColor];
            float heavisideTerm;
            heavisideTerm = HEAVISIDE(raw);
            float denominator = (heavisideTerm * pF + (1 - heavisideTerm) * pB) + 0.000001;
            float commonFirstTerm = -(pF - pB) / denominator * deltaTerm;
            if (pF == pB) continue;
            vec3 pixel(x, y, depthPointer[x]);
            float dfdxTerm = dfdxPointer[x];
            float dfdyTerm = -dfdyPointer[x];
            if (pixel.z == 1) {
                cv::Point c = findClosestContourPoint(cv::Point(x, y), dfdxTerm, -dfdyTerm, abs(raw));
                if (c.x == -1) continue;
                pixel = vec3(c.x, c.y, depthBuffer.at<float>(cv::Point(c.x, c.y)));
            }
            vec3 point3D = pixel;
            pixelToViewFast(point3D, cols, rows, strictFu, strictFv);
            float Xc = point3D.x; float Xc2 = Xc * Xc; float Yc = point3D.y; float Yc2 = Yc * Yc; float Zc = point3D.z; float Zc2 = Zc * Zc;
            pixelJacobi[0] = dfdyTerm * ((fv * Yc2) / Zc2 + fv) + (dfdxTerm * fu * Xc * Yc) / Zc2;
            pixelJacobi[1] = -dfdxTerm * ((fu * Xc2) / Zc2 + fu) - (dfdyTerm * fv * Xc * Yc) / Zc2;
            pixelJacobi[2] = -(dfdyTerm * fv * Xc) / Zc + (dfdxTerm * fu * Yc) / Zc;
            pixelJacobi[3] = -(dfdxTerm * fu) / Zc;
            pixelJacobi[4] = -(dfdyTerm * fv) / Zc;
            pixelJacobi[5] = (dfdyTerm * fv * Yc) / Zc2 + (dfdxTerm * fu * Xc) / Zc2;
            float weightingTerm = -1.0 / log(denominator);
            for (int i = 0; i < 6; i++) {
                pixelJacobi[i] *= commonFirstTerm;
                sJT[i] += pixelJacobi[i];
            }
            for (int i = 0; i < 6; i++) {
                for (int j = i; j < 6; j++) {
                    sJTJ[i * 6 + j] += weightingTerm * pixelJacobi[i] * pixelJacobi[j];
                }
            }
        }
    }
}
This is the part where I call each thread:
vector<std::thread> myThreads;
float step = (roiY.y - roiY.x) / numberOfThreads;
vector<vector<float>> tsJT(numberOfThreads, vector<float>(6, 0));
vector<vector<float>> tsJTJ(numberOfThreads, vector<float>(36, 0));
for (int i = 0; i < numberOfThreads; i++) {
    int start = roiY.x + i * step;
    int end = start + step;
    if (end > roiY.y) end = roiY.y;
    myThreads.push_back(std::thread(&pwp3dV2::jacobiansThread, this, start, end, std::ref(tsJT[i]), std::ref(tsJTJ[i])));
}
vector<float> sJT(6, 0);
vector<float> sJTJ(36, 0);
for (int i = 0; i < numberOfThreads; i++) myThreads[i].join();
Other Notes
To measure time I used glfwGetTime() before and right after the second code snippet. The measurements vary, but the average is about 15 ms, as I mentioned, for both implementations.
Starting a thread has significant overhead, which might not be worth the time if you have only 15 milliseconds worth of work.
The common solution is to keep threads running in the background and send them data when you need them, instead of calling the std::thread constructor to create a new thread every time you have some work to do.
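A minimal sketch of that idea (not the poster's code): workers are created once and block on a condition variable until work arrives, so the per-frame cost is a notification rather than a thread launch.
// Minimal persistent worker pool: threads stay alive between submissions.
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class WorkerPool {
public:
    explicit WorkerPool(int n) {
        for (int i = 0; i < n; ++i)
            workers.emplace_back([this] { run(); });
    }
    ~WorkerPool() {
        { std::lock_guard<std::mutex> lk(m); done = true; }
        cv.notify_all();
        for (auto& w : workers) w.join();
    }
    void submit(std::function<void()> job) {
        { std::lock_guard<std::mutex> lk(m); jobs.push(std::move(job)); }
        cv.notify_one();
    }
private:
    void run() {
        for (;;) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lk(m);
                cv.wait(lk, [this] { return done || !jobs.empty(); });
                if (done && jobs.empty()) return;
                job = std::move(jobs.front());
                jobs.pop();
            }
            job(); // runs on an already-started thread
        }
    }
    std::vector<std::thread> workers;
    std::queue<std::function<void()>> jobs;
    std::mutex m;
    std::condition_variable cv;
    bool done = false;
};
// Usage: WorkerPool pool(4); pool.submit([]{ /* one image part */ });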
Pure speculation, but two things might be preventing full parallelization:
Processing speed is limited by the memory bus. Cores will wait until data is loaded before continuing.
Data sharing between cores. Some caches are core-specific. If memory is shared between cores, the data must travel down to the shared cache before loading.
On Linux you can use Perf to check for cache misses.
If you want better times, you need to split the loop body from the counter, which takes some preprocessing: something fast, like building an array of structures with a header for each segment, or similar. If you can't think of anything better, you can simply fill a vector<int> with the counter values and then run for_each(std::execution::par, ...) on it, as sketched below. That is much faster.
For timings, there is:
auto t2 = std::chrono::system_clock::now();
std::chrono::milliseconds f = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
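And a minimal sketch of the for_each suggestion above (processRow is a hypothetical stand-in for the per-row work; requires C++17 and a standard library whose <execution> policies actually run in parallel, e.g. MSVC's):
#include <algorithm>
#include <execution>
#include <numeric>
#include <vector>

void processRow(int y); // hypothetical per-row work

void processAllRows(int start, int end)
{
    std::vector<int> rows(end - start);
    std::iota(rows.begin(), rows.end(), start); // start, start+1, ..., end-1
    std::for_each(std::execution::par, rows.begin(), rows.end(),
                  [](int y) { processRow(y); }); // rows processed in parallel
}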

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
// Map threadIdx/blockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = x - blockDim.x * gridDim.x / 2;
float oy = y - blockDim.y * gridDim.y / 2;
float r = 0.2, g = 0.2, b = 0.5;
float maxz = -INF;
for (int i = 0; i < SPHERES_COUNT; i++) {
float n, t = s[i].hit(ox, oy, &n);
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from spere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
image.permute_axes("yzcx");
image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
// Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                        image.size()*sizeof(unsigned char),
                        cudaMemcpyDeviceToHost));
I cannot understand why...
I know that if I remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
the error is not reported, so I thought it might be an out-of-bounds indexing error reported later. But I have an identical version of this program that makes no use of constant memory, and it works fine with the very same kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main()
{
    ......
    kernel<<<grids, threads>>>(dev_bitmap, s);
    ......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main()
{
    ......
    Sphere *d_s;
    cudaGetSymbolAddress((void **)&d_s, s);
    kernel<<<grids, threads>>>(dev_bitmap, d_s);
    ......
which would perform a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache, and the device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when a pointer is used instead. You can see more about how the compiler generates code for constant variable access in this Stack Overflow question and answer.
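Alternatively, the kernel can reference the file-scope __constant__ array directly instead of taking a pointer parameter, which lets the compiler detect the constant access and emit constant-cache loads. A sketch, reusing Sphere, SPHERES_COUNT, and INF from the program above:
// Sketch: access the file-scope __constant__ array directly in the kernel.
__constant__ Sphere s[SPHERES_COUNT];

__global__ void kernel(unsigned char* bitmap) // no Sphere* parameter
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;
    float ox = x - blockDim.x * gridDim.x / 2;
    float oy = y - blockDim.y * gridDim.y / 2;
    float r = 0.2f, g = 0.2f, b = 0.5f;
    float maxz = -INF;
    for (int i = 0; i < SPHERES_COUNT; i++) {
        float n, t = s[i].hit(ox, oy, &n); // direct constant-memory access
        if (t > maxz) {
            r = s[i].r * n;
            g = s[i].g * n;
            b = s[i].b * n;
            maxz = t;
        }
    }
    bitmap[offset * 3]     = (int)(r * 255);
    bitmap[offset * 3 + 1] = (int)(g * 255);
    bitmap[offset * 3 + 2] = (int)(b * 255);
}

// Host side: launch without passing s.
// kernel<<<grids, threads>>>(dev_bitmap);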

Why are my openGL ellipses pointed?

I copied this ellipse code directly from the OpenGL textbook:
void ellipseMidpoint(int xCenter, int yCenter, int Rx, int Ry)
{
    int Rx2 = Rx * Rx;
    int Ry2 = Ry * Ry;
    int twoRx2 = 2 * Rx2;
    int twoRy2 = 2 * Ry2;
    int p;
    int x = 0;
    int y = Ry;
    int px = 0;
    int py = twoRx2 * y;
    // initial points in both quadrants
    ellipsePlotPoints(xCenter, yCenter, x, y);
    // Region 1
    p = round(Ry2 - (Rx2 * Ry) + (0.25 * Rx2));
    while (px < py) {
        x++;
        px += twoRy2;
        if (p < 0)
            p += Ry2 + px;
        else {
            y--;
            py -= twoRx2;
            p += Ry2 + px - py;
        }
        ellipsePlotPoints(xCenter, yCenter, x, y);
    }
    // Region 2
    p = round(Ry2 * (x + 0.5) * (x + 0.5) + Rx2 * (y - 1) * (y - 1) - Rx2 * Ry2);
    while (y > 0) {
        y--;
        py -= twoRx2;
        if (p > 0)
            p += Rx2 - py;
        else {
            x++;
            px += twoRy2;
            p += Rx2 - py + px;
        }
        ellipsePlotPoints(xCenter, yCenter, x, y);
    }
}

void ellipsePlotPoints(int xCenter, int yCenter, int x, int y)
{
    setPixel(xCenter + x, yCenter + y);
    setPixel(xCenter - x, yCenter + y);
    setPixel(xCenter + x, yCenter - y);
    setPixel(xCenter - x, yCenter - y);
}

void setPixel(GLint xPos, GLint yPos)
{
    glBegin(GL_POINTS);
    glVertex2i(xPos, yPos);
    glEnd();
}
The smaller ellipses seem to be fine but the larger ones are pointy and sort of flat at the ends.
Any ideas why?
Here is a current screenshot (omitted here):
I think you're encountering overflow. I played with your code, and while I never saw exactly the same "lemon" shapes as in your pictures, things definitely fell apart at large sizes, and the cause was overflowing the range of the int variables used in the code.
For example, look at one of the first assignments:
int py = twoRx2 * y;
If you substitute, this becomes:
int py = 2 * Rx * Rx * Ry;
If you use a value of 1000 each for Rx and Ry, this is 2,000,000,000, which is very close to 2^31 - 1, the top of the range of a 32-bit int.
If you want to use this algorithm for larger sizes, you could use 64-bit integer variables. Depending on your system, the type would be long or long long. Or more robustly, int64_t after including <stdint.h>.
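To make that fix concrete, here is a sketch of the widened declarations (the rest of the algorithm above is unchanged; Rx, Ry, and y are the same variables as in ellipseMidpoint):
#include <stdint.h>

// Widened accumulators: intermediate values grow to about 2 * Rx^2 * Ry,
// which overflows a 32-bit int around Rx = Ry = 1000.
int64_t Rx2 = (int64_t)Rx * Rx;
int64_t Ry2 = (int64_t)Ry * Ry;
int64_t twoRx2 = 2 * Rx2;
int64_t twoRy2 = 2 * Ry2;
int64_t p;
int64_t px = 0;
int64_t py = twoRx2 * y; // no longer overflows for large radii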
Now, if all you want to do is draw an ellipse with OpenGL, there are much better ways. The Bresenham-type algorithms used in your code are ideal if you need to draw a curve pixel by pixel. But OpenGL is a higher-level API, which knows how to render more complex primitives than just pixels. For a curve, you will most typically use a connected set of line segments to approximate the curve. OpenGL will then take care of turning those line segments into pixels.
The simplest way to draw an ellipse is to directly apply the parametric representation. With phi an angle between 0 and 2*PI, and using the naming from your code, the points on the ellipse are:
x = xCenter + Rx * cos(phi)
y = yCenter + Ry * sin(phi)
You can use an increment for phi that meets your precision requirements, and the code to generate an ellipse approximated by DIV_COUNT points will look something like this:
float angInc = 2.0f * M_PI / (float)DIV_COUNT;
float ang = 0.0f;
glBegin(GL_LINE_LOOP);
for (int iDiv = 0; iDiv < DIV_COUNT; ++iDiv) {
    ang += angInc;
    float x = xCenter + Rx * cos(ang);
    float y = yCenter + Ry * sin(ang);
    glVertex2f(x, y);
}
glEnd();
If you care about efficiency, you can avoid calculating the trigonometric functions for each point, and apply an incremental rotation to calculate each point from the previous one:
float angInc = 2.0f * M_PI / (float)DIV_COUNT;
float cosInc = cos(angInc);
float sinInc = sin(angInc);
float cosAng = 1.0f;
float sinAng = 0.0f;
glBegin(GL_LINE_LOOP);
for (int iDiv = 0; iDiv < DIV_COUNT; ++iDiv) {
    float newCosAng = cosInc * cosAng - sinInc * sinAng;
    sinAng = sinInc * cosAng + cosInc * sinAng;
    cosAng = newCosAng;
    float x = xCenter + Rx * cosAng;
    float y = yCenter + Ry * sinAng;
    glVertex2f(x, y);
}
glEnd();
This code is of course just to illustrate the math and to get you started. In reality, you should use current OpenGL rendering methods, which include vertex buffers, etc.
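As a starting point, here is a sketch of the same ellipse using a vertex buffer object instead of immediate mode (assumes an OpenGL context where buffer objects are available, e.g. through an extension loader, and uses the legacy client-side vertex array path for brevity):
#include <GL/gl.h>
#include <cmath>
#include <vector>

void drawEllipseVBO(float xCenter, float yCenter, float Rx, float Ry, int divCount)
{
    // Generate the ellipse vertices once on the CPU.
    std::vector<float> verts;
    verts.reserve(divCount * 2);
    for (int i = 0; i < divCount; ++i) {
        float ang = 2.0f * 3.14159265f * i / divCount;
        verts.push_back(xCenter + Rx * std::cos(ang));
        verts.push_back(yCenter + Ry * std::sin(ang));
    }
    // Upload them to a buffer object and draw as a line loop.
    GLuint vbo;
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, verts.size() * sizeof(float), verts.data(), GL_STATIC_DRAW);
    glEnableClientState(GL_VERTEX_ARRAY);
    glVertexPointer(2, GL_FLOAT, 0, (void*)0);
    glDrawArrays(GL_LINE_LOOP, 0, divCount);
    glDisableClientState(GL_VERTEX_ARRAY);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &vbo);
}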

C++ and CUDA: why does the code return different results each time?

Update: I found the bug. Since the code I posted before was very complicated, I have simplified it and kept only the part where the problem is.
if (number >= dim * num_points)
    return;
But actually I only have num_points points and want to use num_points threads, so the correct guard should be:
if (number >= num_points)
    return;
Thank you all for the help.
I'm rewriting some C++ code to move it from the CPU to the GPU. The code is pasted below; sorry it's long, but I think the problems are easier to detect this way.
In the code, every thread needs some intermediate results in matrix form, so I allocate device memory for them, such as d_dir2, d_R, d_Stick, d_PStick. The results turned out not to be what I expected, so to debug, I tried to output some intermediate results R in this way:
if (k == 0)
{
    results[tmp_int1 + i * dim + j] = R[tmp_int1 + i * dim + j];
}
and later, in the C++ host code, I print results.
However, I found that results holds different values each time: sometimes it gives the correct answer R, sometimes the value of PStick, sometimes a combination of R and PStick, and sometimes a combination of R and 0 (results is initialized to 0 at the beginning).
I'm very confused about what caused the problem. Any idea? Thank you very much :)
__global__ void stickvote(const int dim, const int num_points, const int gridx, float Sigma, float* input, float* dir2, float* R, float* Stick, float* PStick, float* results) {
    float threshold = 4 * Sigma;
    float c = (-16 * log(0.1f) * (sqrt(Sigma) - 1)) / 3.1415926f / 3.1415926f;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int number = row * BLOCK_SIZE * gridx + col;
    if (number >= dim * num_points) //// The bug is here!
        return;
    // ... (rest of the kernel omitted in this simplified version)
}

extern "C" void KernelStickVote(int dim, int num_points, float Sigma, float* input, float* results) {
    const int totalpoints = num_points;
    const int totalpoints_input = (dim + 1) * (dim + 1) * num_points;
    const int totalpoints_output = dim * dim * num_points;
    size_t size_input = totalpoints_input * sizeof(float);
    size_t size_output = totalpoints_output * sizeof(float);
    float* d_input;
    cutilSafeCall(cudaMalloc((void**)&d_input, size_input));
    float* d_result;
    cutilSafeCall(cudaMalloc((void**)&d_result, size_output));
    // used to save dir, and calculate dir * dir'
    float* d_dir2;
    cutilSafeCall(cudaMalloc((void**)&d_dir2, dim * num_points * sizeof(float)));
    // used to save R: dim * dim * N
    float* d_R;
    cutilSafeCall(cudaMalloc((void**)&d_R, size_output));
    // used to save Stick: dim * dim * N
    float* d_Stick;
    cutilSafeCall(cudaMalloc((void**)&d_Stick, size_output));
    // used to save PStick: dim * dim * N
    float* d_PStick;
    cutilSafeCall(cudaMalloc((void**)&d_PStick, size_output));
    // Copy input data from host to device
    cudaMemcpy(d_input, input, size_input, cudaMemcpyHostToDevice);
    int totalblock = (totalpoints % BLOCKPOINTS == 0 ? totalpoints / BLOCKPOINTS : (int(totalpoints / BLOCKPOINTS) + 1));
    int gridx = (65535 < totalblock ? 65535 : totalblock);
    int gridy = (totalblock % gridx == 0 ? totalblock / gridx : (int(totalblock / gridx) + 1));
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(gridx, gridy);
    stickvote<<<dimGrid, dimBlock>>>(dim, num_points, gridx, Sigma, d_input, d_dir2, d_R, d_Stick, d_PStick, d_result);
    cudaMemcpy(results, d_result, size_output, cudaMemcpyDeviceToHost);
    cudaFree(d_input);
    cudaFree(d_result);
    cudaFree(d_dir2);
    cudaFree(d_R);
    cudaFree(d_Stick);
    cudaFree(d_PStick);
}
The original poster of the question performed some further code simplification and debugging themselves, and discovered that the guard statement in the kernel:
if (number >= dim * num_points)
    return;
was, in fact, incorrect and should have been
if (number >= num_points)
    return;
This was the source of the error.
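To spell out the likely mechanism: the grid is sized for num_points work items rounded up to whole blocks, so threads whose flat index falls in [num_points, dim * num_points) could pass the looser check and touch memory they did not own. A minimal sketch of the corrected masking (BLOCK_SIZE and gridx as in KernelStickVote above):
// Sketch: the guard must match the true work count, so the surplus
// threads created by block rounding exit before touching memory.
__global__ void stickvote_fixed(const int num_points, const int gridx, float* results)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int number = row * BLOCK_SIZE * gridx + col; // flat thread index
    if (number >= num_points) // one thread per point; surplus threads exit
        return;
    // ... per-point work writes only to regions owned by `number` ...
}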
This answer has been added as a community wiki answer with the intention of removing this question from the unanswered queue.