Compute gradient for voxel data efficiently - c++

What is the most efficient way of computing the gradient for fixed sized voxel data, such as the source code below. Note that I need the gradient at any point in space. The gradients will be used for estimating normals in a marching cubes implementation.
#import <array>
struct VoxelData {
VoxelData(float* data, unsigned int xDim, unsigned int yDim, unsigned int zDim)
:data(data), xDim(xDim), yDim(yDim), zDim(zDim)
{}
std::array<float,3> get_gradient(float x, float y, float z){
std::array<float,3> res;
// compute gradient efficiently
return res;
}
float get_density(int x, int y, int z){
if (x<0 || y<0 || z<0 || x >= xDim || y >= yDim || z >= zDim){
return 0;
}
return data[get_element_index(x, y, z)];
}
int get_element_index(int x, int y, int z){
return x * zDim * yDim + y*zDim + z;
}
const float* const data;
const unsigned int xDim;
const unsigned int yDim;
const unsigned int zDim;
};
Update 1
A demo project of the problem can be found here:
https://github.com/mortennobel/OpenGLVoxelizer
Currently the output is like the picture below (based on MooseBoys code):
Update 2
The solution that I'm looking for must give fairly accurate gradients, since they are used as normals in a visualisation and visual artefacts like the ones below must be avoided.
Update 2
Solution from the user example is:

The following produces a linearly interpolated gradient field:
std::array<float,3> get_gradient(float x, float y, float z){
std::array<float,3> res;
// x
int xi = (int)(x + 0.5f);
float xf = x + 0.5f - xi;
float xd0 = get_density(xi - 1, (int)y, (int)z);
float xd1 = get_density(xi, (int)y, (int)z);
float xd2 = get_density(xi + 1, (int)y, (int)z);
res[0] = (xd1 - xd0) * (1.0f - xf) + (xd2 - xd1) * xf; // lerp
// y
int yi = (int)(y + 0.5f);
float yf = y + 0.5f - yi;
float yd0 = get_density((int)x, yi - 1, (int)z);
float yd1 = get_density((int)x, yi, (int)z);
float yd2 = get_density((int)x, yi + 1, (int)z);
res[1] = (yd1 - yd0) * (1.0f - yf) + (yd2 - yd1) * yf; // lerp
// z
int zi = (int)(z + 0.5f);
float zf = z + 0.5f - zi;
float zd0 = get_density((int)x, (int)y, zi - 1);
float zd1 = get_density((int)x, (int)y, zi);
float zd2 = get_density((int)x, (int)y, zi + 1);
res[2] = (zd1 - zd0) * (1.0f - zf) + (zd2 - zd1) * zf; // lerp
return res;
}

One important technique for optimization in many implementations involves time/space trade off. As a suggestion, anywhere you can pre-calc and cache your results may be worth looking at.

In general Sobel filters provide slightly nicer results than simple central tendency, but take longer to compute (the Sobel is essentially a smooth filter combined with central tendency). A classic Sobel requires weighting 26 samples, while central tendency only requires 6. However, there is a trick: with GPUs you can get hardware-based trilinear interpolation for free. That means you can compute a Sobel with 8 texture reads, and this can be done in parallel across the GPU. The following page illustrates this technique using GLSL
http://www.mccauslandcenter.sc.edu/mricrogl/notes/gradients
For your project you would probably want to compute the gradients on the GPU and use GPGPU methods to copy the results back from the GPU to the CPU for further processing.

MooseBoys already posted a component-wise linear interpolation. It is discontinuous in the y and z component though, whereever (int)x changes from one value to the next (same thing for the other components). This might cause such a rough picture as you are seeing it. If you have enough performance to spare you can improve this by considering not just (int)x but (int)(x+1) aswell. This might look like the following
std::array<float,3> get_gradient(float x, float y, float z){
std::array<float,3> res;
int xim = (int)(x + 0.5f);
float xfm = x + 0.5f - xi;
int yim = (int)(y + 0.5f);
float yfm = y + 0.5f - yi;
int zim = (int)(z + 0.5f);
float zfm = z + 0.5f - zi;
int xi = (int)x;
float xf = x - xi;
int yi = (int)y;
float yf = y - yi;
int zi = (int)z;
float zf = z - zi;
float xd0 = yf*( zf *get_density(xim - 1, yi+1, zi+1)
+ (1.0f - zf)*get_density(xim - 1, yi+1, zi))
+(1.0f - yf)*(zf *get_density(xim - 1, yi , zi+1)
+ (1.0f - zf)*get_density(xim - 1, yi , zi));
float xd1 = yf*( zf *get_density(xim, yi+1, zi+1)
+ (1.0f - zf)*get_density(xim, yi+1, zi))
+(1.0f - yf)*(zf *get_density(xim, yi , zi+1)
+ (1.0f - zf)*get_density(xim, yi , zi));
float xd2 = yf*( zf *get_density(xim + 1, yi+1, zi+1)
+ (1.0f - zf)*get_density(xim + 1, yi+1, zi))
+(1.0f - yf)*(zf *get_density(xim + 1, yi , zi+1)
+ (1.0f - zf)*get_density(xim + 1, yi , zi));
res[0] = (xd1 - xd0) * (1.0f - xfm) + (xd2 - xd1) * xfm;
float yd0 = xf*( zf *get_density(xi+1, yim-1, zi+1)
+ (1.0f - zf)*get_density(xi+1, yim-1, zi))
+(1.0f - xf)*(zf *get_density(xi , yim-1, zi+1)
+ (1.0f - zf)*get_density(xi , yim-1, zi));
float yd1 = xf*( zf *get_density(xi+1, yim , zi+1)
+ (1.0f - zf)*get_density(xi+1, yim , zi))
+(1.0f - xf)*(zf *get_density(xi , yim , zi+1)
+ (1.0f - zf)*get_density(xi , yim , zi));
float yd2 = xf*( zf *get_density(xi+1, yim+1, zi+1)
+ (1.0f - zf)*get_density(xi+1, yim+1, zi))
+(1.0f - xf)*(zf *get_density(xi , yim+1, zi+1)
+ (1.0f - zf)*get_density(xi , yim+1, zi));
res[1] = (yd1 - yd0) * (1.0f - yfm) + (yd2 - yd1) * yfm;
float zd0 = xf*( yf *get_density(xi+1, yi+1, zim-1)
+ (1.0f - yf)*get_density(xi+1, yi , zim-1))
+(1.0f - xf)*(yf *get_density(xi, yi+1, zim-1)
+ (1.0f - yf)*get_density(xi, yi , zim-1));
float zd1 = xf*( yf *get_density(xi+1, yi+1, zim)
+ (1.0f - yf)*get_density(xi+1, yi , zim))
+(1.0f - xf)*(yf *get_density(xi, yi+1, zim)
+ (1.0f - yf)*get_density(xi, yi , zim));
float zd2 = xf*( yf *get_density(xi+1, yi+1, zim+1)
+ (1.0f - yf)*get_density(xi+1, yi , zim+1))
+(1.0f - xf)*(yf *get_density(xi, yi+1, zim+1)
+ (1.0f - yf)*get_density(xi, yi , zim+1));
res[2] = (zd1 - zd0) * (1.0f - zfm) + (zd2 - zd1) * zfm;
return res;
}
This can probably be written a bit more concise, but maybe this way you can still see what is happening. If this still is not smooth enough for you will have to look into cubic / spline interpolation or similar.

Related

How does this lighting calculation work?

I have that piece of code that is responsible for lighting a pyramid.
float Geometric3D::calculateLight(int vert1, int vert2, int vert3) {
float ax = tabX[vert2] - tabX[vert1];
float ay = tabY[vert2] - tabY[vert1];
float az = tabZ[vert2] - tabZ[vert1];
float bx = tabX[vert3] - tabX[vert1];
float by = tabY[vert3] - tabY[vert1];
float bz = tabZ[vert3] - tabZ[vert1];
float Nx = (ay * bz) - (az * by);
float Ny = (az * bx) - (ax * bz);;
float Nz = (ax * by) - (ay * bx);;
float Lx = -300.0f;
float Ly = -300.0f;
float Lz = -1000.0f;
float lenN = sqrtf((Nx * Nx) + (Ny * Ny) + (Nz * Nz));
float lenL = sqrtf((Lx * Lx) + (Ly * Ly) + (Lz * Lz));
float res = ((Nx * Lx) + (Ny * Ly) + (Nz * Lz)) / (lenN * lenL);
if (res < 0.0f)
res = -res;
return res;
}
I cannot understand calculations at the end. Can someone explain me the maths that is behind them? I know that firstly program calculates two vectors of a plane to compute the normal of it (which goes for vector N). Vector L stand for lighting but what happens next? Why do we calculate length of normal and light then multiply it and divide by their sizes?

C++ Angles between a vector and a point

I got 2 points own=(x, y, z) and en=(x, y, z) which represents my own position in the world and some other player position. the other player also got pitch (from 90 degrees to -90) and yaw (0 to 360). I want to calculate the angles between the other player look and my own position.
In 2D, alpha is what I'm trying to calculate:
int main()
{
float own_x = 1, own_y = 1, own_z = 1;
float en_x = 10, en_y = 1, en_z = 10;
float pi = 3.14159265;
float pitch = 0.f * (pi / 180), yaw = 45.f * (pi / 180);
float x = sin(yaw) * cos(pitch);
float y = sin(pitch);
float z = cos(pitch) * cos(yaw);
float vec_length = sqrt(pow(en_x - own_x, 2) + pow(en_y - own_y, 2) + pow(en_y - own_y, 2));
x /= vec_length;
y /= vec_length;
z /= vec_length;
float cos_t = ((en_x - own_x)*x + (en_y - own_y)*y + (en_z - own_z)*z) / sqrt(pow(en_x - own_x, 2) + pow(en_y - own_y, 2) + pow(en_y - own_y, 2));
float arc = acos(cos_t) * (180 / pi);
return 0;
}
you divide twice with the length of en-own: You should remove
vec_length, and xyz /= vec_length.
your division at cos_t is buggy, you use _y twice in the
expression instead of _y and _z
Note: instead of pow(x, 2), use x*x, it is faster usually (compilers may not optimize pow(x, 2) to x*x).

Half of my ellipse drawn in the wrong place

Here is the code for an oval drawing method I am working on. I am applying the Bresenham method to plot its co-ordinates, and taking advantage of the ellipse's symmetrical properties to draw the same pixel in four different places.
void cRenderClass::plotEllipse(int xCentre, int yCentre, int width, int height, float angle, float xScale, float yScale)
{
if ((height == width) && (abs(xScale - yScale) < 0.005))
plotCircle(xCentre, yCentre, width, xScale);
std::vector<std::vector <float>> rotate;
if (angle > 360.0f)
{
angle -= 180.0f;
}
rotate = maths.rotateMatrix(angle, 'z');
//rotate[0][0] = cos(angle)
//rotate[0][1] = sin(angle)
float theta = atan2(-height*rotate[0][1], width*rotate[0][0]);
if (angle > 90.0f && angle < 180.0f)
{
theta += PI;
}
//add scalation in at a later date
float xShear = (width * (cos(theta) * rotate[0][0])) - (height * (sin(theta) * rotate[0][1]));
float yShear = (width * (cos(theta) * rotate[0][1])) + (height * (sin(theta) * rotate[0][0]));
float widthAxis = abs(sqrt(((rotate[0][0] * width) * (rotate[0][0] * width)) + ((rotate[0][1] * height) * (rotate[0][1] * height))));
float heightAxis = (width * height) / widthAxis;
int aSquared = widthAxis * widthAxis;
int fourASquared = 4*aSquared;
int bSquared = heightAxis * heightAxis;
int fourBSquared = 4*bSquared;
x0 = 0;
y0 = heightAxis;
int sigma = (bSquared * 2) + (aSquared * (1 - (2 * heightAxis)));
while ((bSquared * x0) <= (aSquared * y0))
{
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
if (sigma >= 0)
{
sigma += (fourASquared * (1 - y0));
y0--;
}
sigma += (bSquared * ((4 * x0) + 6));
x0++;
}
x0 = widthAxis;
y0 = 0;
sigma = (aSquared * 2) + (bSquared * (1 - (2 * widthAxis)));
while ((aSquared * y0) <= (bSquared * x0))
{
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
if (sigma >= 0)
{
sigma += (fourBSquared * (1 - x0));
x0--;
}
sigma += (aSquared * (4 * y0) + 6);
y0++;
}
//the above algorithm hasn't been quite completed
//there are still a few things I want to enquire Andy about
//before I move on
//this other algorithm definitely works
//however
//it is computationally expensive
//and the line drawing isn't as refined as the first one
//only use this as a last resort
/* std::vector<std::vector <float>> rotate;
rotate = maths.rotateMatrix(angle, 'z');
float s = rotate[0][1];
float c = rotate[0][0];
float ratio = (float)height / (float)width;
float px, py, xNew, yNew;
for (int theta = 0; theta <= 360; theta++)
{
px = (xCentre + (cos(maths.degToRad(theta)) * (width / 2))) - xCentre;
py = (yCentre - (ratio * (sin(maths.degToRad(theta)) * (width / 2)))) - yCentre;
x0 = (px * c) - (py * s);
y0 = (px * s) + (py * c);
drawPixel(x0 + xCentre, y0 + yCentre);
}*/
}
Here's the problem. When testing the rotation matrix on my oval drawing function, I expect it to draw an ellipse at a slant from its original horizontal position as signified by 'angle'. Instead, it makes a heart shape. This is sweet, but not the result I want.
I have managed to get the other algorithm (as seen in the bottom part of that code sample) working successfully, but it takes more time to compute, and doesn't draw lines quite as nicely. I only plan to use that if I can't get this Bresenham one working.
Can anyone help?

Faster quaternion vector multiplication doesn't work

I need a faster quaternion-vector multiplication routine for my math library. Right now I'm using the canonical v' = qv(q^-1), which produces the same result as multiplying the vector by a matrix made from the quaternion, so I'm confident in it's correctness.
So far I've implemented 3 alternative "faster" methods:
#1, I have no idea where I got this one from:
v' = (q.xyz * 2 * dot(q.xyz, v)) + (v * (q.w*q.w - dot(q.xyz, q.zyx))) + (cross(q.xyz, v) * q.w * w)
Implemented as:
vec3 rotateVector(const quat& q, const vec3& v)
{
vec3 u(q.x, q.y, q.z);
float s = q.w;
return vec3(u * 2.0f * vec3::dot(u, v))
+ (v * (s*s - vec3::dot(u, u)))
+ (vec3::cross(u, v) * s * 2.0f);
}
#2, courtesy of this fine blog
t = 2 * cross(q.xyz, v);
v' = v + q.w * t + cross(q.xyz, t);
Implemented as:
__m128 rotateVector(__m128 q, __m128 v)
{
__m128 temp = _mm_mul_ps(vec4::cross(q, v), _mm_set1_ps(2.0f));
return _mm_add_ps(
_mm_add_ps(v, _mm_mul_ps(_mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3)), temp)),
vec4::cross(q, temp));
}
And #3, from numerous sources,
v' = v + 2.0 * cross(cross(v, q.xyz) + q.w * v, q.xyz);
implemented as:
__m128 rotateVector(__m128 q, __m128 v)
{
//return v + 2.0 * cross(cross(v, q.xyz) + q.w * v, q.xyz);
return _mm_add_ps(v,
_mm_mul_ps(_mm_set1_ps(2.0f),
vec4::cross(
_mm_add_ps(
_mm_mul_ps(_mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3)), v),
vec4::cross(v, q)),
q)));
}
All 3 of these produce incorrect results. I have, however, noticed some interesting patterns. First of all, #1 and #2 produce the same result. #3 produces the same result that I get from multiplying the vector by a derived matrix if said matrix is transposed (I discovered this by accident, previously my quat-to-matrix code assumed row-major matrices, which was incorrect).
The data storage of my quaternions are defined as:
union
{
__m128 data;
struct { float x, y, z, w; };
float f[4];
};
Are my implementations flawed, or am I missing something here?
Main issue, if you want to rotate the 3d vector by quaternion, you require to calculate all 9 scalars of rotation matrix. In your examples, calculation of rotation matrix is IMPLICIT. The order of calculation can be not optimal.
If you generate 3x3 matrix from quaternion and multiply vector, you should have same number of arithmetic operations (#see code at bottom).
What i recommend.
Try to generate matrix 3x3 and multiply your vector, measure the speed and compare with previous.
Analyze the explicit solution, and try to optimize for custom architecture.
try to implement alternative quaternion multiplication, and derived multiplication from equation q*v*q'.
//============
alternative multiplication pseudocode
/**
alternative way of quaternion multiplication,
can speedup multiplication for some systems (PDA for example)
http://mathforum.org/library/drmath/view/51464.html
http://www.lboro.ac.uk/departments/ma/gallery/quat/src/quat.ps
in provided code by url's many bugs, have to be rewriten.
*/
inline xxquaternion mul_alt( const xxquaternion& q) const {
float t0 = (x-y)*(q.y-q.x);
float t1 = (w+z)*(q.w+q.z);
float t2 = (w-z)*(q.y+q.x);
float t3 = (x+y)*(q.w-q.z);
float t4 = (x-z)*(q.z-q.y);
float t5 = (x+z)*(q.z+q.y);
float t6 = (w+y)*(q.w-q.x);
float t7 = (w-y)*(q.w+q.x);
float t8 = t5 + t6 + t7;
float t9 = (t4 + t8)*0.5;
return xxquaternion ( t3+t9-t6,
t2+t9-t7,
t1+t9-t8,
t0+t9-t5 );
// 9 multiplications 27 addidtions 8 variables
// but of couse we can clean 4 variables
/*
float r = w, i = z, j = y, k =x;
float br = q.w, bi = q.z, bj = q.y, bk =q.x;
float t0 = (k-j)*(bj-bk);
float t1 = (r+i)*(br+bi);
float t2 = (r-i)*(bj+bk);
float t3 = (k+j)*(br-bi);
float t4 = (k-i)*(bi-bj);
float t5 = (k+i)*(bi+bj);
float t6 = (r+j)*(br-bk);
float t7 = (r-j)*(br+bk);
float t8 = t5 + t6 + t7;
float t9 = (t4 + t8)*0.5;
float rr = t0+t9-t5;
float ri = t1+t9-t8;
float rj = t2+t9-t7;
float rk = t3+t9-t6;
return xxquaternion ( rk, rj, ri, rr );
*/
}
//============
explicit vector rotation variants
/**
rotate vector by quaternion
*/
inline vector3 rotate(const vector3& v)const{
xxquaternion q(v.x * w + v.z * y - v.y * z,
v.y * w + v.x * z - v.z * x,
v.z * w + v.y * x - v.x * y,
v.x * x + v.y * y + v.z * z);
return vector3(w * q.x + x * q.w + y * q.z - z * q.y,
w * q.y + y * q.w + z * q.x - x * q.z,
w * q.z + z * q.w + x * q.y - y * q.x)*( 1.0f/norm() );
// 29 multiplications, 20 addidtions, 4 variables
// 5
/*
// refrence implementation
xxquaternion r = (*this)*xxquaternion(v.x, v.y, v.z, 0)*this->inverted();
return vector3( r.x, r.y, r.z );
*/
/*
// alternative implementation
float wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
x2 = q.x + q.x; y2 = q.y + q.y; z2 = q.z + q.z;
xx = q.x * x2; xy = q.x * y2; xz = q.x * z2;
yy = q.y * y2; yz = q.y * z2; zz = q.z * z2;
wx = q.w * x2; wy = q.w * y2; wz = q.w * z2;
return vector3( v.x - v.x * (yy + zz) + v.y * (xy - wz) + v.z * (xz + wy),
v.y + v.x * (xy + wz) - v.y * (xx + zz) + v.z * (yz - wx),
v.z + v.x * (xz - wy) + v.y * (yz + wx) - v.z * (xx + yy) )*( 1.0f/norm() );
// 18 multiplications, 21 addidtions, 12 variables
*/
};
Good luck.

How to speed up bilinear interpolation of image?

I'm trying to rotate image with interpolation, but it's too slow for real time for big images.
the code something like:
for(int y=0;y<dst_h;++y)
{
for(int x=0;x<dst_w;++x)
{
//do inverse transform
fPoint pt(Transform(Point(x, y)));
//in coor of src
int x1= (int)floor(pt.x);
int y1= (int)floor(pt.y);
int x2= x1+1;
int y2= y1+1;
if((x1>=0&&x1<src_w&&y1>=0&&y1<src_h)&&(x2>=0&&x2<src_w&&y2>=0&&y2<src_h))
{
Mask[y][x]= 1; //show pixel
float dx1= pt.x-x1;
float dx2= 1-dx1;
float dy1= pt.y-y1;
float dy2= 1-dy1;
//bilinear
pd[x].blue= (dy2*(ps[y1*src_w+x1].blue*dx2+ps[y1*src_w+x2].blue*dx1)+
dy1*(ps[y2*src_w+x1].blue*dx2+ps[y2*src_w+x2].blue*dx1));
pd[x].green= (dy2*(ps[y1*src_w+x1].green*dx2+ps[y1*src_w+x2].green*dx1)+
dy1*(ps[y2*src_w+x1].green*dx2+ps[y2*src_w+x2].green*dx1));
pd[x].red= (dy2*(ps[y1*src_w+x1].red*dx2+ps[y1*src_w+x2].red*dx1)+
dy1*(ps[y2*src_w+x1].red*dx2+ps[y2*src_w+x2].red*dx1));
//nearest neighbour
//pd[x]= ps[((int)pt.y)*src_w+(int)pt.x];
}
else
Mask[y][x]= 0; //transparent pixel
}
pd+= dst_w;
}
How I can speed up this code, I try to parallelize this code but it seems there is no speed up because of memory access pattern (?).
The key is to do most of your computations as ints. The only thing that is necessary to do as a float is the weighting. See here for a good resource.
From that same resource:
int px = (int)x; // floor of x
int py = (int)y; // floor of y
const int stride = img->width;
const Pixel* p0 = img->data + px + py * stride; // pointer to first pixel
// load the four neighboring pixels
const Pixel& p1 = p0[0 + 0 * stride];
const Pixel& p2 = p0[1 + 0 * stride];
const Pixel& p3 = p0[0 + 1 * stride];
const Pixel& p4 = p0[1 + 1 * stride];
// Calculate the weights for each pixel
float fx = x - px;
float fy = y - py;
float fx1 = 1.0f - fx;
float fy1 = 1.0f - fy;
int w1 = fx1 * fy1 * 256.0f;
int w2 = fx * fy1 * 256.0f;
int w3 = fx1 * fy * 256.0f;
int w4 = fx * fy * 256.0f;
// Calculate the weighted sum of pixels (for each color channel)
int outr = p1.r * w1 + p2.r * w2 + p3.r * w3 + p4.r * w4;
int outg = p1.g * w1 + p2.g * w2 + p3.g * w3 + p4.g * w4;
int outb = p1.b * w1 + p2.b * w2 + p3.b * w3 + p4.b * w4;
int outa = p1.a * w1 + p2.a * w2 + p3.a * w3 + p4.a * w4;
wow you are doing a lot inside most inner loop like:
1.float to int conversions
can do all on floats ...
they are these days pretty fast
the conversion is what is killing you
also you are mixing float and ints together (if i see it right) which is the same ...
2.transform(x,y)
any unnecessary call makes heap trashing and slow things down
instead add 2 variables xx,yy and interpolate them insde your for loops
3.if ....
why to heck are you adding if ?
limit the for ranges before loop and not inside ...
the background can be filled with other fors before or later