I have that piece of code that is responsible for lighting a pyramid.
float Geometric3D::calculateLight(int vert1, int vert2, int vert3) {
float ax = tabX[vert2] - tabX[vert1];
float ay = tabY[vert2] - tabY[vert1];
float az = tabZ[vert2] - tabZ[vert1];
float bx = tabX[vert3] - tabX[vert1];
float by = tabY[vert3] - tabY[vert1];
float bz = tabZ[vert3] - tabZ[vert1];
float Nx = (ay * bz) - (az * by);
float Ny = (az * bx) - (ax * bz);;
float Nz = (ax * by) - (ay * bx);;
float Lx = -300.0f;
float Ly = -300.0f;
float Lz = -1000.0f;
float lenN = sqrtf((Nx * Nx) + (Ny * Ny) + (Nz * Nz));
float lenL = sqrtf((Lx * Lx) + (Ly * Ly) + (Lz * Lz));
float res = ((Nx * Lx) + (Ny * Ly) + (Nz * Lz)) / (lenN * lenL);
if (res < 0.0f)
res = -res;
return res;
}
I cannot understand calculations at the end. Can someone explain me the maths that is behind them? I know that firstly program calculates two vectors of a plane to compute the normal of it (which goes for vector N). Vector L stand for lighting but what happens next? Why do we calculate length of normal and light then multiply it and divide by their sizes?
I got 2 points own=(x, y, z) and en=(x, y, z) which represents my own position in the world and some other player position. the other player also got pitch (from 90 degrees to -90) and yaw (0 to 360). I want to calculate the angles between the other player look and my own position.
In 2D, alpha is what I'm trying to calculate:
int main()
{
float own_x = 1, own_y = 1, own_z = 1;
float en_x = 10, en_y = 1, en_z = 10;
float pi = 3.14159265;
float pitch = 0.f * (pi / 180), yaw = 45.f * (pi / 180);
float x = sin(yaw) * cos(pitch);
float y = sin(pitch);
float z = cos(pitch) * cos(yaw);
float vec_length = sqrt(pow(en_x - own_x, 2) + pow(en_y - own_y, 2) + pow(en_y - own_y, 2));
x /= vec_length;
y /= vec_length;
z /= vec_length;
float cos_t = ((en_x - own_x)*x + (en_y - own_y)*y + (en_z - own_z)*z) / sqrt(pow(en_x - own_x, 2) + pow(en_y - own_y, 2) + pow(en_y - own_y, 2));
float arc = acos(cos_t) * (180 / pi);
return 0;
}
you divide twice with the length of en-own: You should remove
vec_length, and xyz /= vec_length.
your division at cos_t is buggy, you use _y twice in the
expression instead of _y and _z
Note: instead of pow(x, 2), use x*x, it is faster usually (compilers may not optimize pow(x, 2) to x*x).
Here is the code for an oval drawing method I am working on. I am applying the Bresenham method to plot its co-ordinates, and taking advantage of the ellipse's symmetrical properties to draw the same pixel in four different places.
void cRenderClass::plotEllipse(int xCentre, int yCentre, int width, int height, float angle, float xScale, float yScale)
{
if ((height == width) && (abs(xScale - yScale) < 0.005))
plotCircle(xCentre, yCentre, width, xScale);
std::vector<std::vector <float>> rotate;
if (angle > 360.0f)
{
angle -= 180.0f;
}
rotate = maths.rotateMatrix(angle, 'z');
//rotate[0][0] = cos(angle)
//rotate[0][1] = sin(angle)
float theta = atan2(-height*rotate[0][1], width*rotate[0][0]);
if (angle > 90.0f && angle < 180.0f)
{
theta += PI;
}
//add scalation in at a later date
float xShear = (width * (cos(theta) * rotate[0][0])) - (height * (sin(theta) * rotate[0][1]));
float yShear = (width * (cos(theta) * rotate[0][1])) + (height * (sin(theta) * rotate[0][0]));
float widthAxis = abs(sqrt(((rotate[0][0] * width) * (rotate[0][0] * width)) + ((rotate[0][1] * height) * (rotate[0][1] * height))));
float heightAxis = (width * height) / widthAxis;
int aSquared = widthAxis * widthAxis;
int fourASquared = 4*aSquared;
int bSquared = heightAxis * heightAxis;
int fourBSquared = 4*bSquared;
x0 = 0;
y0 = heightAxis;
int sigma = (bSquared * 2) + (aSquared * (1 - (2 * heightAxis)));
while ((bSquared * x0) <= (aSquared * y0))
{
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
if (sigma >= 0)
{
sigma += (fourASquared * (1 - y0));
y0--;
}
sigma += (bSquared * ((4 * x0) + 6));
x0++;
}
x0 = widthAxis;
y0 = 0;
sigma = (aSquared * 2) + (bSquared * (1 - (2 * widthAxis)));
while ((aSquared * y0) <= (bSquared * x0))
{
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) + y0));
drawPixel(xCentre + x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
drawPixel(xCentre - x0, yCentre + ((floor((x0 * yShear) / xShear)) - y0));
if (sigma >= 0)
{
sigma += (fourBSquared * (1 - x0));
x0--;
}
sigma += (aSquared * (4 * y0) + 6);
y0++;
}
//the above algorithm hasn't been quite completed
//there are still a few things I want to enquire Andy about
//before I move on
//this other algorithm definitely works
//however
//it is computationally expensive
//and the line drawing isn't as refined as the first one
//only use this as a last resort
/* std::vector<std::vector <float>> rotate;
rotate = maths.rotateMatrix(angle, 'z');
float s = rotate[0][1];
float c = rotate[0][0];
float ratio = (float)height / (float)width;
float px, py, xNew, yNew;
for (int theta = 0; theta <= 360; theta++)
{
px = (xCentre + (cos(maths.degToRad(theta)) * (width / 2))) - xCentre;
py = (yCentre - (ratio * (sin(maths.degToRad(theta)) * (width / 2)))) - yCentre;
x0 = (px * c) - (py * s);
y0 = (px * s) + (py * c);
drawPixel(x0 + xCentre, y0 + yCentre);
}*/
}
Here's the problem. When testing the rotation matrix on my oval drawing function, I expect it to draw an ellipse at a slant from its original horizontal position as signified by 'angle'. Instead, it makes a heart shape. This is sweet, but not the result I want.
I have managed to get the other algorithm (as seen in the bottom part of that code sample) working successfully, but it takes more time to compute, and doesn't draw lines quite as nicely. I only plan to use that if I can't get this Bresenham one working.
Can anyone help?
I need a faster quaternion-vector multiplication routine for my math library. Right now I'm using the canonical v' = qv(q^-1), which produces the same result as multiplying the vector by a matrix made from the quaternion, so I'm confident in it's correctness.
So far I've implemented 3 alternative "faster" methods:
#1, I have no idea where I got this one from:
v' = (q.xyz * 2 * dot(q.xyz, v)) + (v * (q.w*q.w - dot(q.xyz, q.zyx))) + (cross(q.xyz, v) * q.w * w)
Implemented as:
vec3 rotateVector(const quat& q, const vec3& v)
{
vec3 u(q.x, q.y, q.z);
float s = q.w;
return vec3(u * 2.0f * vec3::dot(u, v))
+ (v * (s*s - vec3::dot(u, u)))
+ (vec3::cross(u, v) * s * 2.0f);
}
#2, courtesy of this fine blog
t = 2 * cross(q.xyz, v);
v' = v + q.w * t + cross(q.xyz, t);
Implemented as:
__m128 rotateVector(__m128 q, __m128 v)
{
__m128 temp = _mm_mul_ps(vec4::cross(q, v), _mm_set1_ps(2.0f));
return _mm_add_ps(
_mm_add_ps(v, _mm_mul_ps(_mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3)), temp)),
vec4::cross(q, temp));
}
And #3, from numerous sources,
v' = v + 2.0 * cross(cross(v, q.xyz) + q.w * v, q.xyz);
implemented as:
__m128 rotateVector(__m128 q, __m128 v)
{
//return v + 2.0 * cross(cross(v, q.xyz) + q.w * v, q.xyz);
return _mm_add_ps(v,
_mm_mul_ps(_mm_set1_ps(2.0f),
vec4::cross(
_mm_add_ps(
_mm_mul_ps(_mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3)), v),
vec4::cross(v, q)),
q)));
}
All 3 of these produce incorrect results. I have, however, noticed some interesting patterns. First of all, #1 and #2 produce the same result. #3 produces the same result that I get from multiplying the vector by a derived matrix if said matrix is transposed (I discovered this by accident, previously my quat-to-matrix code assumed row-major matrices, which was incorrect).
The data storage of my quaternions are defined as:
union
{
__m128 data;
struct { float x, y, z, w; };
float f[4];
};
Are my implementations flawed, or am I missing something here?
Main issue, if you want to rotate the 3d vector by quaternion, you require to calculate all 9 scalars of rotation matrix. In your examples, calculation of rotation matrix is IMPLICIT. The order of calculation can be not optimal.
If you generate 3x3 matrix from quaternion and multiply vector, you should have same number of arithmetic operations (#see code at bottom).
What i recommend.
Try to generate matrix 3x3 and multiply your vector, measure the speed and compare with previous.
Analyze the explicit solution, and try to optimize for custom architecture.
try to implement alternative quaternion multiplication, and derived multiplication from equation q*v*q'.
//============
alternative multiplication pseudocode
/**
alternative way of quaternion multiplication,
can speedup multiplication for some systems (PDA for example)
http://mathforum.org/library/drmath/view/51464.html
http://www.lboro.ac.uk/departments/ma/gallery/quat/src/quat.ps
in provided code by url's many bugs, have to be rewriten.
*/
inline xxquaternion mul_alt( const xxquaternion& q) const {
float t0 = (x-y)*(q.y-q.x);
float t1 = (w+z)*(q.w+q.z);
float t2 = (w-z)*(q.y+q.x);
float t3 = (x+y)*(q.w-q.z);
float t4 = (x-z)*(q.z-q.y);
float t5 = (x+z)*(q.z+q.y);
float t6 = (w+y)*(q.w-q.x);
float t7 = (w-y)*(q.w+q.x);
float t8 = t5 + t6 + t7;
float t9 = (t4 + t8)*0.5;
return xxquaternion ( t3+t9-t6,
t2+t9-t7,
t1+t9-t8,
t0+t9-t5 );
// 9 multiplications 27 addidtions 8 variables
// but of couse we can clean 4 variables
/*
float r = w, i = z, j = y, k =x;
float br = q.w, bi = q.z, bj = q.y, bk =q.x;
float t0 = (k-j)*(bj-bk);
float t1 = (r+i)*(br+bi);
float t2 = (r-i)*(bj+bk);
float t3 = (k+j)*(br-bi);
float t4 = (k-i)*(bi-bj);
float t5 = (k+i)*(bi+bj);
float t6 = (r+j)*(br-bk);
float t7 = (r-j)*(br+bk);
float t8 = t5 + t6 + t7;
float t9 = (t4 + t8)*0.5;
float rr = t0+t9-t5;
float ri = t1+t9-t8;
float rj = t2+t9-t7;
float rk = t3+t9-t6;
return xxquaternion ( rk, rj, ri, rr );
*/
}
//============
explicit vector rotation variants
/**
rotate vector by quaternion
*/
inline vector3 rotate(const vector3& v)const{
xxquaternion q(v.x * w + v.z * y - v.y * z,
v.y * w + v.x * z - v.z * x,
v.z * w + v.y * x - v.x * y,
v.x * x + v.y * y + v.z * z);
return vector3(w * q.x + x * q.w + y * q.z - z * q.y,
w * q.y + y * q.w + z * q.x - x * q.z,
w * q.z + z * q.w + x * q.y - y * q.x)*( 1.0f/norm() );
// 29 multiplications, 20 addidtions, 4 variables
// 5
/*
// refrence implementation
xxquaternion r = (*this)*xxquaternion(v.x, v.y, v.z, 0)*this->inverted();
return vector3( r.x, r.y, r.z );
*/
/*
// alternative implementation
float wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
x2 = q.x + q.x; y2 = q.y + q.y; z2 = q.z + q.z;
xx = q.x * x2; xy = q.x * y2; xz = q.x * z2;
yy = q.y * y2; yz = q.y * z2; zz = q.z * z2;
wx = q.w * x2; wy = q.w * y2; wz = q.w * z2;
return vector3( v.x - v.x * (yy + zz) + v.y * (xy - wz) + v.z * (xz + wy),
v.y + v.x * (xy + wz) - v.y * (xx + zz) + v.z * (yz - wx),
v.z + v.x * (xz - wy) + v.y * (yz + wx) - v.z * (xx + yy) )*( 1.0f/norm() );
// 18 multiplications, 21 addidtions, 12 variables
*/
};
Good luck.
I'm trying to rotate image with interpolation, but it's too slow for real time for big images.
the code something like:
for(int y=0;y<dst_h;++y)
{
for(int x=0;x<dst_w;++x)
{
//do inverse transform
fPoint pt(Transform(Point(x, y)));
//in coor of src
int x1= (int)floor(pt.x);
int y1= (int)floor(pt.y);
int x2= x1+1;
int y2= y1+1;
if((x1>=0&&x1<src_w&&y1>=0&&y1<src_h)&&(x2>=0&&x2<src_w&&y2>=0&&y2<src_h))
{
Mask[y][x]= 1; //show pixel
float dx1= pt.x-x1;
float dx2= 1-dx1;
float dy1= pt.y-y1;
float dy2= 1-dy1;
//bilinear
pd[x].blue= (dy2*(ps[y1*src_w+x1].blue*dx2+ps[y1*src_w+x2].blue*dx1)+
dy1*(ps[y2*src_w+x1].blue*dx2+ps[y2*src_w+x2].blue*dx1));
pd[x].green= (dy2*(ps[y1*src_w+x1].green*dx2+ps[y1*src_w+x2].green*dx1)+
dy1*(ps[y2*src_w+x1].green*dx2+ps[y2*src_w+x2].green*dx1));
pd[x].red= (dy2*(ps[y1*src_w+x1].red*dx2+ps[y1*src_w+x2].red*dx1)+
dy1*(ps[y2*src_w+x1].red*dx2+ps[y2*src_w+x2].red*dx1));
//nearest neighbour
//pd[x]= ps[((int)pt.y)*src_w+(int)pt.x];
}
else
Mask[y][x]= 0; //transparent pixel
}
pd+= dst_w;
}
How I can speed up this code, I try to parallelize this code but it seems there is no speed up because of memory access pattern (?).
The key is to do most of your computations as ints. The only thing that is necessary to do as a float is the weighting. See here for a good resource.
From that same resource:
int px = (int)x; // floor of x
int py = (int)y; // floor of y
const int stride = img->width;
const Pixel* p0 = img->data + px + py * stride; // pointer to first pixel
// load the four neighboring pixels
const Pixel& p1 = p0[0 + 0 * stride];
const Pixel& p2 = p0[1 + 0 * stride];
const Pixel& p3 = p0[0 + 1 * stride];
const Pixel& p4 = p0[1 + 1 * stride];
// Calculate the weights for each pixel
float fx = x - px;
float fy = y - py;
float fx1 = 1.0f - fx;
float fy1 = 1.0f - fy;
int w1 = fx1 * fy1 * 256.0f;
int w2 = fx * fy1 * 256.0f;
int w3 = fx1 * fy * 256.0f;
int w4 = fx * fy * 256.0f;
// Calculate the weighted sum of pixels (for each color channel)
int outr = p1.r * w1 + p2.r * w2 + p3.r * w3 + p4.r * w4;
int outg = p1.g * w1 + p2.g * w2 + p3.g * w3 + p4.g * w4;
int outb = p1.b * w1 + p2.b * w2 + p3.b * w3 + p4.b * w4;
int outa = p1.a * w1 + p2.a * w2 + p3.a * w3 + p4.a * w4;
wow you are doing a lot inside most inner loop like:
1.float to int conversions
can do all on floats ...
they are these days pretty fast
the conversion is what is killing you
also you are mixing float and ints together (if i see it right) which is the same ...
2.transform(x,y)
any unnecessary call makes heap trashing and slow things down
instead add 2 variables xx,yy and interpolate them insde your for loops
3.if ....
why to heck are you adding if ?
limit the for ranges before loop and not inside ...
the background can be filled with other fors before or later