Here is the code I am using.
#define ANGLETORADIANS 0.017453292519943295769236907684886f // PI / 180
#define RADIANSTOANGLE 57.295779513082320876798154814105f // 180 / PI
rotation = rotation *ANGLETORADIANS;
cosRotation = cos(rotation);
sinRotation = sin(rotation);
for(int i = 0; i < 3; i++)
{
px[i] = (vec[i].x + centerX) * (cosRotation - (vec[i].y + centerY)) * sinRotation;
py[i] = (vec[i].x + centerX) * (sinRotation + (vec[i].y + centerY)) * cosRotation;
printf("num: %i, px: %f, py: %f\n", i, px[i], py[i]);
}
so far it seams my Y value is being fliped.. say I enter the value of X = 1 and Y = 1 with a 45 rotation you should see about x = 0 and y = 1.25 ish but I get x = 0 y = -1.25.
Also my 90 degree rotation always return x = 0 and y = 0.
p.s I know I'm only centering my values and not putting them back where they came from. It's not needed to put them back as all I need to know is the value I'm getting now.
Your bracket placement doesn't look right to me. I would expect:
px[i] = (vec[i].x + centerX) * cosRotation - (vec[i].y + centerY) * sinRotation;
py[i] = (vec[i].x + centerX) * sinRotation + (vec[i].y + centerY) * cosRotation;
Your brackets are wrong. It should be
px[i] = ((vec[i].x + centerX) * cosRotation) - ((vec[i].y + centerY) * sinRotation);
py[i] = ((vec[i].x + centerX) * sinRotation) + ((vec[i].y + centerY) * cosRotation);
instead
Related
My function getHeightOfTerrain() is calling a barycentric formula function that is not returning the correct height for the one set test height in : heightMapFromArray[][].
I've tried watching OpenGL JAVA Game tutorials 14,21, 22, by "thin matrix" and I am confused on how to use my array: heightMapforBaryCentric in both of the supplied functions, and how to set the arguments that are passed to the baryCentic() function in some sort of manner so that I can solve the problem.
int creaateTerrain(int height, int width)
{
float holderY[6] = { 0.f ,0.f,0.f,0.f,0.f,0.f };
float scaleit = 1.5f;
float holder[6] = { 0.f,0.f,0.f,0.f,0.f,0.f };
for (int z = 0, z2 =0; z < iterationofHeightMap;z2++)
{
//each loop is two iterations and creates one quad (two triangles)
//however because each iteration is by two (i.e. : x=x+2) om bottom
//the amount of triangles is half the x value
//
//number of vertices : 80 x 80 x 6.
//column
for (int x = 0, x2 = 0; x < iterationofHeightMap;x2++)
{
//relevant - A : first triangle - on left triangle
//[row] [colum[]
holder[0] = heightMapFromArray[z][x];
//holder[0] = (float)imageData[(z / 2 * MAP_Z + (x / 2)) * 3];
//holder[0] = holder[0] / 255;// *scaleit;
vertices.push_back(glm::vec3(x, holder[0], z));
//match height map with online barycentric use
heightMapforBaryCentric[x2][z2] = holder[0];
holder[1] = heightMapFromArray[z+2][x];
//holder[1] = (float)imageData[(((z + 2) / 2 * MAP_Z + ((x) / 2))) * 3];
//holder[1] = holder[1] / 255;// 6 * scaleit;
vertices.push_back(glm::vec3(x, holder[1], z + 2));
//match height map with online barycentric use
heightMapforBaryCentric[x2][z2+1] = holder[1];
holder[2] = heightMapFromArray[z+2][x+2];
//holder[2] = (float)imageData[(((z + 2) / 2 * MAP_Z + ((x + 2) / 2))) * 3];
//holder[2] = holder[2] / 255;// *scaleit;
vertices.push_back(glm::vec3(x + 2, holder[2], z + 2));
////match height map with online barycentric use
heightMapforBaryCentric[x2+1][z2+1] = holder[2];
//relevant - B - second triangle (on right side)
holder[3] = heightMapFromArray[z][x];
//holder[3] = (float)imageData[((z / 2)*MAP_Z + (x / 2)) * 3];
//holder[3] = holder[3] / 255;// 256 * scaleit;
vertices.push_back(glm::vec3(x, holder[3], z));
holder[4] = heightMapFromArray[x+2][z+2];
//holder[4] = (float)imageData[(((z + 2) / 2 * MAP_Z + ((x + 2) / 2))) * 3];
//holder[4] = holder[4] / 255;// *scaleit;
vertices.push_back(glm::vec3(x + 2, holder[4], z + 2));
holder[5] = heightMapFromArray[x+2][z];
//holder[5] = (float)imageData[((z / 2)*MAP_Z + ((x + 2) / 2)) * 3];
//holder[5] = holder[5] / 255;// *scaleit;
vertices.push_back(glm::vec3(x + 2, holder[5], z));
x = x + 2;
}
z = z + 2;
}
return(1);
}
float getHeightOfTerrain(float worldX, float worldZ) {
float terrainX = worldX;
float terrainZ = worldZ;
int gridSquareSize = 2.0f;
gridX = (int)floor(terrainX / gridSquareSize);
gridZ = (int)floor(terrainZ / gridSquareSize);
xCoord = ((float)(fmod(terrainX, gridSquareSize)) / (float)gridSquareSize);
zCoord = ((float)(fmod(terrainZ, gridSquareSize)) / (float)gridSquareSize);
if (xCoord <= (1 - zCoord))
{
answer = baryCentric(
//left triangle
glm::vec3(0.0f, heightMapforBaryCentric[gridX][gridZ], 0.0f),
glm::vec3(0.0f, heightMapforBaryCentric[gridX][gridZ+1], 1.0f),
glm::vec3(1.0f, heightMapforBaryCentric[gridX+1][gridZ+1], 1.0f),
glm::vec2(xCoord, zCoord));
// if (answer != 1)
// {
// fprintf(stderr, "Z:gridx: %d gridz: %d answer: %f\n", gridX, gridZ,answer);
//
// }
}
else
{
//right triangle
answer = baryCentric(glm::vec3(0, heightMapforBaryCentric[gridX][gridZ], 0),
glm::vec3(1,heightMapforBaryCentric[gridX+1][gridZ+1], 1),
glm::vec3(1,heightMapforBaryCentric[gridX+1][gridZ], 0),
glm::vec2(xCoord, zCoord));
}
if (answer == 1)
{
answer = 0;
}
//answer = abs(answer - 1);
return(answer);
}
float baryCentric(glm::vec3 p1, glm::vec3 p2, glm::vec3 p3 , glm::vec2 pos) {
float det = (p2.z - p3.z) * (p1.x - p3.x) + (p3.x - p2.x) * (p1.z - p3.z);
float l1 = ((p2.z - p3.z) * (pos.x - p3.x) + (p3.x - p2.x) * (pos.y - p3.z)) / det;
float l2 = ((p3.z - p1.z) * (pos.x - p3.x) + (p1.x - p3.x) * (pos.y - p3.z)) / det;
float l3 = 1.0f - l1 - l2;
return (l1 * p1.y + l2 * p2.y + l3 * p3.y);
}
My expected results were that the center of the test grid's height to be the set value .5 and gradually less as the heights declined. My results were the heights being all the same, varied, or increasing. Usually these heights were under the value of one.
What I try to achieve is to rotate a line around rectangle center so it always stays in its bounds touching them (or having some padding).
Now I have the following routine for this, as you see I use tan calculations dividing my rectangle into 8 parts (red lines)
It works so far, but for some reason I have inconsistency using other calculation for radius drawing (green line), the lines won't always match as expected and I wonder why.
Basically the same could be achieved using just sin/cos calculations and finding cross points between lines and rect borders, but for some reason I could not get it to work.
std::pair<Point, Point>
MathUtils::calculateRotatingLine(Size size, double degrees)
{
auto width = size.width;
auto height = size.height;
double diagonalAngleTopRight = radiansToDegrees(atan((width / 2) / (height / 2)));
double diagonalAngleBottomRight = 90 + (90 - diagonalAngleTopRight);
double diagonalAngleBottomLeft = 180 + diagonalAngleTopRight;
double diagonalAngleTopLeft = 180 + diagonalAngleBottomRight;
double x, y;
/*
* *8*1*
* 7* *2
* 6* *3
* *5*4*
*/
// 1
if (degrees >= 0 && degrees <= diagonalAngleTopRight) {
x = width / 2 + height / 2 * tan(degreesToRadians(degrees));
y = 0;
}
// 2
else if (degrees > diagonalAngleTopRight && degrees <= 90) {
x = width;
y = width / 2 * tan(degreesToRadians(degrees - diagonalAngleTopRight));
}
// 3
else if (degrees > 90 && degrees <= diagonalAngleBottomRight) {
x = width;
y = height / 2 + width / 2 * tan(degreesToRadians(degrees - 90));
}
// 4
else if (degrees > diagonalAngleBottomRight && degrees <= 180) {
x = width - height / 2 * tan(degreesToRadians(degrees - diagonalAngleBottomRight));
y = height;
}
// 5
else if (degrees > 180 && degrees <= diagonalAngleBottomLeft) {
x = width / 2 - height / 2 * tan(degreesToRadians(degrees - 180));
y = height;
}
// 6
else if (degrees > diagonalAngleBottomLeft && degrees <= 270) {
x = 0;
y = height - width / 2 * tan(degreesToRadians(degrees - diagonalAngleBottomLeft));
}
// 7
else if (degrees > 270 && degrees <= diagonalAngleTopLeft) {
x = 0;
y = height / 2 - width / 2 * tan(degreesToRadians(degrees - 270));
}
// 8
else {
x = height / 2 * tan(degreesToRadians(degrees - diagonalAngleTopLeft));
y = 0;
}
return {Point{width / 2, height / 2}, Point{x, y}};
}
Green line calculation
Point
MathUtils::calculateCirclePoint(double radius, double degrees)
{
return {radius * cos(degreesToRadians(degrees)), radius * sin(degreesToRadians(degrees))};
}
EDIT
Awesome, it works thanks to #MBo
Point
MathUtils::calculateCrossPoint(Size size, double degrees)
{
auto x0 = size.width / 2;
auto y0 = size.height / 2;
auto vx = cos(degreesToRadians(degrees - 90));
auto vy = sin(degreesToRadians(degrees - 90));
//potential border positions
auto ex = vx > 0 ? size.width : 0;
auto ey = vy > 0 ? size.height : 0;
//check for horizontal/vertical directions
if (vx == 0) {
return {x0, ey};
}
if (vy == 0) {
return {ex, y0};
}
// in general case find times of intersections with horizontal and vertical edge line
auto tx = (ex - x0) / vx;
auto ty = (ey - y0) / vy;
// and get intersection for smaller parameter value
if (tx <= ty) {
return {ex, y0 + tx * vy};
}
return {x0 + ty * vx, ey};
}
Pseudocode to find intersection of ray emitted from rectangle center (with angle an in radians) with edges. (Works also for other (x0,y0) positions)
x0 = width / 2;
y0 = height / 2;
vx = cos(an);
vy = sin(an);
//potential border positions
ex = vx > 0? width: 0
ey = vy > 0? height: 0
//check for horizontal/vertical directions
if vx = 0 then
return cx = x0, cy = ey
if vy = 0 then
return cx = ex, cy = y0
//in general case find times of intersections with horizontal and vertical edge line
tx = (ex - x0) / vx
ty = (ey - y0) / vy
//and get intersection for smaller parameter value
if tx <= ty then
return cx = ex, cy = y0 + tx * vy
else
return cx = x0 + ty * vx, cy = ey
Here is my sobel filter function performed on a grayscale image. Apparently I'm not doing my calculations correct because I keep getting an all black image. I have already turned in the project but it is bothering me that the results aren't right.
int sobelH[3][3] = { -1, 0, 1,
-2, 0, 2,
-1, 0, 1 },
sobelV[3][3] = { 1, 2, 1,
0, 0, 0,
-1, -2, -1 };
//variable declaration
int mag;
int pix_x, pix_y = 0;
int img_x, img_y;
for (img_x = 0; img_x < img->x; img_x++)
{
for (img_y = 0; img_y < img->y; img_y++)
{
pix_x = 0;
pix_y = 0;
//calculating the X and Y convolutions
for (int i = -1; i <= 1; i++)
{
for (int j = -1; j <= 1; j++)
{
pix_x += (img->data[img_y * img->x + img_x].red + img->data[img_y * img->x + img_x].green + img->data[img_y * img->x + img_x].blue) * sobelH[1 + i][1 + j];
pix_y += (img->data[img_y * img->x + img_x].red + img->data[img_y * img->x + img_x].green + img->data[img_y * img->x + img_x].blue) * sobelV[1 + i][1 + j];
}
}
//Gradient magnitude
mag = sqrt((pix_x * pix_x) + (pix_y * pix_y));
if (mag > RGB_COMPONENT_COLOR)
mag = 255;
if (mag < 0)
mag = 0;
//Setting the new pixel value
img->data[img_y * img->x + img_x].red = mag;
img->data[img_y * img->x + img_x].green = mag;
img->data[img_y * img->x + img_x].blue = mag;
}
}
Although your code could use some improvement, the main reason is that you compute the convolution at constant img_y and img_x. What you need to do is:
pix_x += (img->data[img_y * img->x + img_x + i].red + img->data[img_y * img->x + img_x + i].green + img->data[img_y * img->x + img_x + i].blue) * sobelH[1 + i][1 + j];
Indeed, the Sobel convolution is symmetric, so if you compute the convolution with a constant image, it will result in only black.
Note that in the above example I do not take into account the border of the image. You should make sure to not access pixels that are outside your pixel array.
Another mistake is that you're writing in the input image. You write at location (x,y), then compute the filter result for location (x+1,y) using the modified value at (x,y), which is the wrong value to use.
You need to write your result to a new image.
In the process of writing a program for processing digital images, I wrote a CUDA kernel that runs slowly. The code is given below:
__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
float G, h;
float fx[3];
float fy[3];
float g[2][2];
float k10 = 0.0;
float k11 = 0.0;
float k12 = 0.0;
float k20 = 0.0;
float k21 = 0.0;
float k22 = 0.0;
float k30 = 0.0;
float k31 = 0.0;
float k32 = 0.0;
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
{
int idx0 = yIndex * width + xIndex;
if (inputArray[idx0] > 0)
{
for (int i = 0; i < kc2; i++)
{
for (int j = 0; j < kc2; j++)
{
int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
float3 rgb = col_image[idx1];
k10 = k10 + constMat1[i * kc2 + j] * rgb.x;
k11 = k11 + constMat1[i * kc2 + j] * rgb.y;
k12 = k12 + constMat1[i * kc2 + j] * rgb.z;
k20 = k20 + constMat2[i * kc2 + j] * rgb.x;
k21 = k21 + constMat2[i * kc2 + j] * rgb.y;
k22 = k22 + constMat2[i * kc2 + j] * rgb.z;
k30 = k30 + constMat3[i * kc2 + j] * rgb.x;
k31 = k31 + constMat3[i * kc2 + j] * rgb.y;
k32 = k32 + constMat3[i * kc2 + j] * rgb.z;
}
}
fx[0] = kc2 * (k30 - k20);
fx[1] = kc2 * (k31 - k21);
fx[2] = kc2 * (k32 - k22);
fy[0] = kc2 * (k10 - k20);
fy[1] = kc2 * (k11 - k21);
fy[2] = kc2 * (k12 - k22);
g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
g[1][0] = g[0][1];
g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2]
G = g[0][0] * g[1][1] - g[0][1] * g[1][0];
h = g[0][0] + g[1][1];
// Output
int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
outputArray[idx2] = (h * h) / G;
}
}
}
Here some (non-negative) values of inputArray are processed. The array col-image contains color components in the RGB model. If the value of inputArray satisfies the condition, then we compute the special coefficients k_{ij} in a neighborhood of kc2 on kc2 with center at the considered point (the value of kc2 is either 3 or 5). The values of constMat[1,2,3] are stored in the device's constant memory:
__device__ __constant__ float constMat[];
Then we calculate the characteristics fx, fy, g_{ij}, h, G and write the resulting value in the corresponding cell of outputArray.
Importantly, all the data specified is stored in global memory, and the fact that the input array can be large enough (about 40 million points). All this directly affects the speed of the kernel.
How do we speed up the execution of this kernel (any techniques are welcome: use of shared memory / textures, use of stencil templates, etc.)?
What I would call a "standard" usage of shared memory to buffer a block of col_image for use (and reuse) by the threadblock would be a "standard" suggestion here.
According to my tests, it seems to offer a substantial improvement. Since you have not provided a complete code, or any sort of data set or results verification, I will skip all those also. What follows then is a not-really-tested implementation of shared memory into your existing code, to "buffer" a (threadblockwidth + kc2)*(threadblockheight+kc2) "patch" of the col_image input data into a shared memory buffer. Thereafter, during the double-nested for-loops, the data is read out of the shared memory buffer.
A 2D shared memory stencil operation like this is an exercise in indexing as well as an exercise in handling edge cases. Your code is somewhat simpler in that we need only consider the edges to the "right" and "downward" when considering the needed "halo" of data to be buffered into shared memory.
I have not attempted to verify that this code is perfect. However it should give you a "roadmap" for how to implement a 2D shared memory buffer system, with some motivation for the effort: I witness about a ~5x speedup by doing so, although YMMV, and its entirely possible I've made a performance mistake.
Here's a worked example, showing the speedup on Pascal Titan X, CUDA 8.0.61, Linux:
$ cat t390.cu
#include <stdio.h>
#include <iostream>
const int adim = 6000;
const int KC2 = 5;
const int thx = 32;
const int thy = 32;
__constant__ float constMat1[KC2*KC2];
__constant__ float constMat2[KC2*KC2];
__constant__ float constMat3[KC2*KC2];
__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
float G, h;
float fx[3];
float fy[3];
float g[2][2];
float k10 = 0.0;
float k11 = 0.0;
float k12 = 0.0;
float k20 = 0.0;
float k21 = 0.0;
float k22 = 0.0;
float k30 = 0.0;
float k31 = 0.0;
float k32 = 0.0;
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
int idx0 = yIndex * width + xIndex;
#ifdef USE_SHARED
__shared__ float3 s_col_image[thy+KC2][thx+KC2];
int idx = xIndex;
int idy = yIndex;
int DATAHSIZE= height;
int WSIZE = kc2;
int DATAWSIZE = width;
float3 *input = col_image;
int BLKWSIZE = thx;
int BLKHSIZE = thy;
if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE+WSIZE))
s_col_image[threadIdx.y][threadIdx.x]=input[idx0];
if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE) && (threadIdx.y > BLKWSIZE - WSIZE))
s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x] = input[idx0+(WSIZE-1)*width];
if ((idx < DATAHSIZE) && (idy < DATAWSIZE+WSIZE) && (threadIdx.x > BLKHSIZE - WSIZE))
s_col_image[threadIdx.y][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)];
if ((idx < DATAHSIZE) && (idy < DATAWSIZE) && (threadIdx.x > BLKHSIZE - WSIZE) && (threadIdx.y > BLKWSIZE - WSIZE))
s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)*width + (WSIZE-1)];
__syncthreads();
#endif
if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
{
if (inputArray[idx0] > 0)
{
for (int i = 0; i < kc2; i++)
{
for (int j = 0; j < kc2; j++)
{
#ifdef USE_SHARED
float3 rgb = s_col_image[threadIdx.y][threadIdx.x];
#else
int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
float3 rgb = col_image[idx1];
#endif
k10 = k10 + constMat1[i * kc2 + j] * rgb.x;
k11 = k11 + constMat1[i * kc2 + j] * rgb.y;
k12 = k12 + constMat1[i * kc2 + j] * rgb.z;
k20 = k20 + constMat2[i * kc2 + j] * rgb.x;
k21 = k21 + constMat2[i * kc2 + j] * rgb.y;
k22 = k22 + constMat2[i * kc2 + j] * rgb.z;
k30 = k30 + constMat3[i * kc2 + j] * rgb.x;
k31 = k31 + constMat3[i * kc2 + j] * rgb.y;
k32 = k32 + constMat3[i * kc2 + j] * rgb.z;
}
}
fx[0] = kc2 * (k30 - k20);
fx[1] = kc2 * (k31 - k21);
fx[2] = kc2 * (k32 - k22);
fy[0] = kc2 * (k10 - k20);
fy[1] = kc2 * (k11 - k21);
fy[2] = kc2 * (k12 - k22);
g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
g[1][0] = g[0][1];
g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2]; // had a missing semicolon
G = g[0][0] * g[1][1] - g[0][1] * g[1][0];
h = g[0][0] + g[1][1];
// Output
int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2); // possible indexing bug here
outputArray[idx2] = (h * h) / G;
}
}
}
int main(){
int *d_inputArray;
int height = adim;
int width = adim;
float *d_outputArray;
float3 *d_col_image;
int kc2 = KC2;
cudaMalloc(&d_inputArray, height*width*sizeof(int));
cudaMemset(d_inputArray, 1, height*width*sizeof(int));
cudaMalloc(&d_col_image, (height+kc2)*(width+kc2)*sizeof(float3));
cudaMalloc(&d_outputArray, height*width*sizeof(float));
dim3 threads(thx,thy);
dim3 blocks((adim+threads.x-1)/threads.x, (adim+threads.y-1)/threads.y);
Kernel<<<blocks,threads>>>( d_inputArray, d_outputArray, d_col_image, height, width, kc2 );
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t390 t390.cu
$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1473== NVPROF is profiling process 1473, command: ./t390
==1473== Profiling application: ./t390
==1473== Profiling result:
Time(%) Time Calls Avg Min Max Name
97.29% 34.705ms 1 34.705ms 34.705ms 34.705ms Kernel(int*, float*, float3*, int, int, int)
2.71% 965.14us 1 965.14us 965.14us 965.14us [CUDA memset]
==1473== API calls:
Time(%) Time Calls Avg Min Max Name
88.29% 310.69ms 3 103.56ms 550.23us 309.46ms cudaMalloc
9.86% 34.712ms 1 34.712ms 34.712ms 34.712ms cudaDeviceSynchronize
1.05% 3.6801ms 364 10.110us 247ns 453.59us cuDeviceGetAttribute
0.70% 2.4483ms 4 612.07us 547.62us 682.25us cuDeviceTotalMem
0.08% 284.32us 4 71.079us 63.098us 79.616us cuDeviceGetName
0.01% 29.533us 1 29.533us 29.533us 29.533us cudaMemset
0.01% 21.189us 1 21.189us 21.189us 21.189us cudaLaunch
0.00% 5.2730us 12 439ns 253ns 1.1660us cuDeviceGet
0.00% 3.4710us 6 578ns 147ns 2.4820us cudaSetupArgument
0.00% 3.1090us 3 1.0360us 340ns 2.1660us cuDeviceGetCount
0.00% 1.0370us 1 1.0370us 1.0370us 1.0370us cudaConfigureCall
ubuntu#titanxp-DiGiTS-Dev-Box:~/bobc/misc$ nvcc -arch=sm_61 -o t390 t390.cu -DUSE_SHARED
ubuntu#titanxp-DiGiTS-Dev-Box:~/bobc/misc$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1545== NVPROF is profiling process 1545, command: ./t390
==1545== Profiling application: ./t390
==1545== Profiling result:
Time(%) Time Calls Avg Min Max Name
86.17% 5.4181ms 1 5.4181ms 5.4181ms 5.4181ms Kernel(int*, float*, float3*, int, int, int)
13.83% 869.94us 1 869.94us 869.94us 869.94us [CUDA memset]
==1545== API calls:
Time(%) Time Calls Avg Min Max Name
96.13% 297.15ms 3 99.050ms 555.80us 295.90ms cudaMalloc
1.76% 5.4281ms 1 5.4281ms 5.4281ms 5.4281ms cudaDeviceSynchronize
1.15% 3.5664ms 364 9.7970us 247ns 435.92us cuDeviceGetAttribute
0.86% 2.6475ms 4 661.88us 642.85us 682.42us cuDeviceTotalMem
0.09% 266.42us 4 66.603us 62.005us 77.380us cuDeviceGetName
0.01% 29.624us 1 29.624us 29.624us 29.624us cudaMemset
0.01% 19.147us 1 19.147us 19.147us 19.147us cudaLaunch
0.00% 4.8560us 12 404ns 248ns 988ns cuDeviceGet
0.00% 3.3390us 6 556ns 134ns 2.3510us cudaSetupArgument
0.00% 3.1190us 3 1.0390us 331ns 2.0780us cuDeviceGetCount
0.00% 1.1940us 1 1.1940us 1.1940us 1.1940us cudaConfigureCall
$
We see that the kernel execution time is ~35ms in the non-shared case, and ~5.5ms in the shared case. For this case I set kc2=5. For the kc2=3 case, performance gains will be less.
A few notes:
Your posted code was missing a semicolon on one line. I've added that and marked the line in my code.
I suspect you may have an indexing error on the "output" write to outputArray. Your indexing is like this:
int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
whereas I would have expected this:
int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);
however I haven't thought carefully about it, so I may be wrong here.
In the future, if you want help with a problem like this, I'd advise you to at least provide the level of complete code scaffolding and description that I have. Provide a complete code that somebody else could immediately pick up and test, without having to write their own code. Also define what platform you are on and what your performance measurement is.
I have been playing around with trying to draw a 320 by 240 full screen resolution image in opengl using java and lwjgl. I set the resolution to 640 by 480 and doubled the size of the pixels to fill in the space. After a lot of google searching I found some information about using the glDrawPixels function to speed up drawing to the screen. I wanted to test it by assigning random colors to all the pixels on the screen, but it wouldn't fill the screen. I divided the width into 4 sections of 80 pixels each and colored them red, green, blue, and white. I saw that I was interleaving the colors but I can't figure out how.
Here is an image of the output:
Here is where I run the openGL code:
// init OpenGL
GL11.glMatrixMode(GL11.GL_PROJECTION);
GL11.glLoadIdentity();
GL11.glOrtho(0, 640, 0, 480, 1, -1);
GL11.glMatrixMode(GL11.GL_MODELVIEW);
while (!Display.isCloseRequested()) {
pollInput();
// Clear the screen and depth buffer
GL11.glClear(GL11.GL_COLOR_BUFFER_BIT | GL11.GL_DEPTH_BUFFER_BIT);
randomizePixels();
GL11.glRasterPos2i(0, 0);
GL11.glDrawPixels(320, 240,GL11.GL_RGBA, GL11.GL_UNSIGNED_BYTE,buff);
GL11.glPixelZoom(2, 2);
Display.update();
}
Display.destroy();
}
and here is where I create the pixel color data:
public void randomizePixels(){
for(int y = 0; y < 240; y++){
for(int x = 0; x < 320; x+=4){
/*
pixels[x * 320 + y] = (byte)(-128 + ran.nextInt(256));
pixels[x * 320 + y + 1] = (byte)(-128 + ran.nextInt(256));
pixels[x * 320 + y + 2] = (byte)(-128 + ran.nextInt(256));
pixels[x * 320 + y + 3] = (byte)(-128 + ran.nextInt(256));
*/
if(x >= 0 && x < 80){
pixels[y * 240 + x] = (byte)128;
pixels[y * 240 + x + 1] = (byte)0;
pixels[y * 240 + x + 2] = (byte)0;
pixels[y * 240 + x + 3] = (byte)128;
}else if(x >= 80 && x < 160){
pixels[y * 240 + x] = (byte)0;
pixels[y * 240 + x + 1] = (byte)128;
pixels[y * 240 + x + 2] = (byte)0;
pixels[y * 240 + x + 3] = (byte)128;
}else if(x >= 160 && x < 240){
pixels[y * 240 + x] = (byte)0;
pixels[y * 240 + x + 1] = (byte)0;
pixels[y * 240 + x + 2] = (byte)128;
pixels[y * 240 + x + 3] = (byte)128;
}else if(x >= 240 && x < 320){
pixels[y * 240 + x] = (byte)128;
pixels[y * 240 + x + 1] = (byte)128;
pixels[y * 240 + x + 2] = (byte)128;
pixels[y * 240 + x + 3] = (byte)128;
}
}
}
buff.put(pixels).flip();
}
If you can figure out why I can't get the pixels to line up to the x and y coordinates I want them to go to that would be great. I have read that glDrawPixels probably isn't the best or fastest way to draw pixels to the screen, but I want to understand why I'm having this particular issue before I have to move on to some other method.
Just load your image (unscaled) into a texture and draw a textured quad.
Don't use glDrawPixels. This function was never properly optimized in most drivers and has was deprecated since OpenGL-2 and got removed from OpenGL-3 core and later.
I spot 2 issues in your randomizePixels().
1. Indexing Pixel Buffer
The total size of pixel buffer is 320x240x4 bytes because the pixel type is GL_RGBA. So, indexing each pixel with subscript operator, [], it would be;
for(int y = 0; y < 240; y++)
{
for(int x = 0; x < 320; x++)
{
pixels[y * 320 * 4 + x * 4 + 0] = ... // R
pixels[y * 320 * 4 + x * 4 + 1] = ... // G
pixels[y * 320 * 4 + x * 4 + 2] = ... // B
pixels[y * 320 * 4 + x * 4 + 3] = ... // A
}
}
2. Colour Value
The max intensity of 8bit colour is 255, for example, an opaque red pixel would be (255, 0, 0, 255).
your operating on the texture. better do it on quadrature. it would yield good results