Is my compiler confused with what it thinks to be overloaded functions? - c++
I have the following function declarations in my header:
// Free-function prototypes as declared in the header; each takes both Vector3f
// operands by non-const reference and returns a float.
float computeDistance3(Vector3f& vec_a, Vector3f& vec_b);
float computeDotProduct3(Vector3f& vecta, Vector3f& vectb);
float computeGeoDotProd3(Vector3f& vecta, Vector3f& vectb);
They are defined as follows in vector3f.cpp:
// Euclidean distance between vec_a and vec_b: the square root of the sum of
// the squared per-axis differences.
float computeDistance3(Vector3f& vec_a, Vector3f& vec_b) {
    const float dx = vec_a.x - vec_b.x;
    const float dy = vec_a.y - vec_b.y;
    const float dz = vec_a.z - vec_b.z;
    const float squaredLength = (dx * dx) + (dy * dy) + (dz * dz);
    return sqrt(squaredLength);
}
// Component-wise dot product of two 3-vectors: x*x + y*y + z*z, summed left
// to right.
// Note: vec_a is bound by reference, whereas vec_b is received by value (a
// copy is made on every call).
float computeDotProduct3(Vector3f& vec_a, Vector3f vec_b) {
    const float xTerm = vec_a.x * vec_b.x;
    const float yTerm = vec_a.y * vec_b.y;
    const float zTerm = vec_a.z * vec_b.z;
    return xTerm + yTerm + zTerm;
}
// Returns cosf(dot / (|vecta| * |vectb|)), or -1.0f as a sentinel when either
// magnitude or the dot product compares equal to zero.
// NOTE(review): dot / (|a| * |b|) is already the cosine of the angle between
// the vectors, so the additional cosf looks unintended -- confirm what this
// function is supposed to return.
float computeGeoDotProd3(Vector3f& vecta, Vector3f& vectb) {
    // All three quantities are evaluated up front, in this order, before any
    // of them is tested.
    const float lengthA = vecta.computeMagnitude();
    const float lengthB = vectb.computeMagnitude();
    const float dot = computeDotProduct3(vecta, vectb);

    // Guard clause: any exact zero short-circuits to the sentinel.
    if (lengthA == 0.0f || lengthB == 0.0f || dot == 0.0f) {
        return -1.0f;
    }
    return cosf(dot / (lengthA * lengthB));
}
I know that their signatures are the same. Is this confusing the compiler? I'm guessing so, because when I compile the code, I get this:
vector3f.cpp: In function ‘float computeGeoDotProd(Vector3f&, Vector3f&)’:
vector3f.cpp:139:43: error: call of overloaded ‘computeDotProduct3(Vector3f&, Vector3f&)’ is ambiguous
vector3f.cpp:139:43: note: candidates are:
vector3f.h:31:7: note: float computeDotProduct3(Vector3f&, Vector3f&)
vector3f.cpp:127:7: note: float computeDotProduct3(Vector3f&, Vector3f)
Question
How do I resolve this ambiguity so that the code compiles?
You're missing an & in the definition:
float computeDotProduct3(Vector3f& vec_a, Vector3f vec_b) {
should be:
float computeDotProduct3(Vector3f& vec_a, Vector3f& vec_b) {
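So you end up with two different (overloaded) functions that differ only in whether the second parameter is taken by value or by reference (&). Both are equally good matches for the call computeDotProduct3(vecta, vectb), so the compiler cannot choose between them - hence the call is ambiguous.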
Related
Differences between NVCC and NVRTC on compilation to PTX
Summary I'm porting a simple raytracing application based on the Scratchapixel version to a bunch of GPU libraries. I sucessfully ported it to CUDA using the runtime API and the driver API, but It throws a Segmentation fault (core dumped) when I try to use the PTX compiled at runtime with NVRTC. If I uncomment the #include <math.h> directive at the beginning of the kernel file (see below), it still works using NVCC (the generated PTX is exactly the same) but fails at compilation using NVRTC. I want to know how can I make NVRTC behave just like NVCC (is it even possible?), or at least to understand the reason behind this issues. Detailed description File kernel.cu (Kernel source): //#include <math.h> #define MAX_RAY_DEPTH 5 template<typename T> class Vec3 { public: T x, y, z; __device__ Vec3() : x(T(0)), y(T(0)), z(T(0)) {} __device__ Vec3(T xx) : x(xx), y(xx), z(xx) {} __device__ Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} __device__ Vec3& normalize() { T nor2 = length2(); if (nor2 > 0) { T invNor = 1 / sqrt(nor2); x *= invNor, y *= invNor, z *= invNor; } return *this; } __device__ Vec3<T> operator * (const T &f) const { return Vec3<T>(x * f, y * f, z * f); } __device__ Vec3<T> operator * (const Vec3<T> &v) const { return Vec3<T>(x * v.x, y * v.y, z * v.z); } __device__ T dot(const Vec3<T> &v) const { return x * v.x + y * v.y + z * v.z; } __device__ Vec3<T> operator - (const Vec3<T> &v) const { return Vec3<T>(x - v.x, y - v.y, z - v.z); } __device__ Vec3<T> operator + (const Vec3<T> &v) const { return Vec3<T>(x + v.x, y + v.y, z + v.z); } __device__ Vec3<T>& operator += (const Vec3<T> &v) { x += v.x, y += v.y, z += v.z; return *this; } __device__ Vec3<T>& operator *= (const Vec3<T> &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } __device__ Vec3<T> operator - () const { return Vec3<T>(-x, -y, -z); } __device__ T length2() const { return x * x + y * y + z * z; } __device__ T length() const { return sqrt(length2()); } }; typedef Vec3<float> Vec3f; typedef 
Vec3<bool> Vec3b; class Sphere { public: const char* id; Vec3f center; /// position of the sphere float radius, radius2; /// sphere radius and radius^2 Vec3f surfaceColor, emissionColor; /// surface color and emission (light) float transparency, reflection; /// surface transparency and reflectivity int animation_frame; Vec3b animation_position_rand; Vec3f animation_position; Sphere( const char* id, const Vec3f &c, const float &r, const Vec3f &sc, const float &refl = 0, const float &transp = 0, const Vec3f &ec = 0) : id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), emissionColor(ec), transparency(transp), reflection(refl) { animation_frame = 0; } //[comment] // Compute a ray-sphere intersection using the geometric solution //[/comment] __device__ bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const { Vec3f l = center - rayorig; float tca = l.dot(raydir); if (tca < 0) return false; float d2 = l.dot(l) - tca * tca; if (d2 > radius2) return false; float thc = sqrt(radius2 - d2); t0 = tca - thc; t1 = tca + thc; return true; } }; __device__ float mix(const float &a, const float &b, const float &mixval) { return b * mixval + a * (1 - mixval); } __device__ Vec3f trace( const Vec3f &rayorig, const Vec3f &raydir, const Sphere *spheres, const unsigned int spheres_size, const int &depth) { float tnear = INFINITY; const Sphere* sphere = NULL; // find intersection of this ray with the sphere in the scene for (unsigned i = 0; i < spheres_size; ++i) { float t0 = INFINITY, t1 = INFINITY; if (spheres[i].intersect(rayorig, raydir, t0, t1)) { if (t0 < 0) t0 = t1; if (t0 < tnear) { tnear = t0; sphere = &spheres[i]; } } } // if there's no intersection return black or background color if (!sphere) return Vec3f(2); Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray Vec3f phit = rayorig + raydir * tnear; // point of intersection Vec3f nhit = phit - sphere->center; // normal at the intersection point 
nhit.normalize(); // normalize normal direction // If the normal and the view direction are not opposite to each other // reverse the normal direction. That also means we are inside the sphere so set // the inside bool to true. Finally reverse the sign of IdotN which we want // positive. float bias = 1e-4; // add some bias to the point from which we will be tracing bool inside = false; if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) { float facingratio = -raydir.dot(nhit); // change the mix value to tweak the effect float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1); // compute reflection direction (not need to normalize because all vectors // are already normalized) Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); refldir.normalize(); Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1); Vec3f refraction = 0; // if the sphere is also transparent compute refraction ray (transmission) if (sphere->transparency) { float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
float cosi = -nhit.dot(raydir); float k = 1 - eta * eta * (1 - cosi * cosi); Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); refrdir.normalize(); refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1); } // the result is a mix of reflection and refraction (if the sphere is transparent) surfaceColor = ( reflection * fresneleffect + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; } else { // it's a diffuse object, no need to raytrace any further for (unsigned i = 0; i < spheres_size; ++i) { if (spheres[i].emissionColor.x > 0) { // this is a light Vec3f transmission = 1; Vec3f lightDirection = spheres[i].center - phit; lightDirection.normalize(); for (unsigned j = 0; j < spheres_size; ++j) { if (i != j) { float t0, t1; if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { transmission = 0; break; } } } surfaceColor += sphere->surfaceColor * transmission * max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; } } } return surfaceColor + sphere->emissionColor; } extern "C" __global__ void raytrace_kernel(unsigned int width, unsigned int height, Vec3f *image, Sphere *spheres, unsigned int spheres_size, float invWidth, float invHeight, float aspectratio, float angle) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < height && x < width) { float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; Vec3f raydir(xx, yy, -1); raydir.normalize(); image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0); } } I can successfully compile it with: nvcc --ptx kernel.cu -o kernel.ptx (full PTX here) and use that PTX in the driver API with cuModuleLoadDataEx using the following snippet. It works as expected. It works fine even if I uncomment the #include <math.h> line (actually, the PTX generated is exactly the same). 
CudaSafeCall( cuInit(0) ); CUdevice device; CudaSafeCall( cuDeviceGet(&device, 0) ); CUcontext context; CudaSafeCall( cuCtxCreate(&context, 0, device) ); unsigned int error_buffer_size = 1024; std::vector<CUjit_option> options; std::vector<void*> values; char* error_log = new char[error_buffer_size]; options.push_back(CU_JIT_ERROR_LOG_BUFFER); //Pointer to a buffer in which to print any log messages that reflect errors values.push_back(error_log); options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator) values.push_back(&error_buffer_size); options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); //Determines the target based on the current attached context (default) values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT CUmodule module; CUresult status = cuModuleLoadDataEx(&module, ptxSource, options.size(), options.data(), values.data()); if (error_log && error_log[0]) { //https://stackoverflow.com/a/7970669/3136474 std::cout << "Compiler error: " << error_log << std::endl; } CudaSafeCall( status ); However, whenever I try to compile this exact kernel using NVRTC (full PTX here), it compiles successfully but gives me a Segmentation fault (core dumped) on the call to cuModuleLoadDataEx (when trying to use the resulting PTX). If I uncomment the #include <math.h> line, it fails at the nvrtcCompileProgram call with the following output: nvrtcSafeBuild() failed at cuda_raytracer_nvrtc_api.cpp:221 : NVRTC_ERROR_COMPILATION Build log: /usr/include/bits/mathcalls.h(177): error: linkage specification is incompatible with previous "isinf" __nv_nvrtc_builtin_header.h(126689): here /usr/include/bits/mathcalls.h(211): error: linkage specification is incompatible with previous "isnan" __nv_nvrtc_builtin_header.h(126686): here 2 errors detected in the compilation of "kernel.cu". 
The code I'm using to compile it with NVRTC is: nvrtcProgram prog; NvrtcSafeCall( nvrtcCreateProgram(&prog, kernelSource, "kernel.cu", 0, NULL, NULL) ); // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options std::vector<const char*> compilationOpts; compilationOpts.push_back("--device-as-default-execution-space"); // NvrtcSafeBuild is a macro which automatically prints nvrtcGetProgramLog if the compilation fails NvrtcSafeBuild( nvrtcCompileProgram(prog, compilationOpts.size(), compilationOpts.data()), prog ); size_t ptxSize; NvrtcSafeCall( nvrtcGetPTXSize(prog, &ptxSize) ); char* ptxSource = new char[ptxSize]; NvrtcSafeCall( nvrtcGetPTX(prog, ptxSource) ); NvrtcSafeCall( nvrtcDestroyProgram(&prog) ); Then I simply load the ptxSource using the previous snippet (note: that code block is the same one used for both the driver API version and the NVRTC version). Additional things that I've noticed/tried so far: The PTX generated by NVCC and the one generated by NVRTC are quite different, but I don't understand them well enough to identify possible problems. Tried to specify the specific GPU architecture (in my case, CC 6.1) to the compiler: no difference. Tried to disable any compiler optimizations (options --ftz=false --prec-sqrt=true --prec-div=true --fmad=false in nvrtcCompileProgram). The PTX file got bigger, but it still segfaults. Tried to add --std=c++11 or --std=c++14 to the NVRTC compiler options. With either of them NVRTC generates an almost empty (4 lines) PTX but issues neither a warning nor an error until I try to use it. Environment OS: Ubuntu 18.04.4 LTS 64-bit nvcc --version: Cuda compilation tools, release 10.1, V10.1.168. Built on Wed_Apr_24_19:10:27_PDT_2019 gcc --version: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 Hardware: Intel I7-7700HQ, GeForce GTX 1050 Ti Edit on OP+1 day I forgot to add my environment. See previous section. Also can you compile the nvrtc output with ptxas? 
– @talonmies' comment The nvcc-generated PTX compiles with a warning: $ ptxas -o /tmp/temp_ptxas_output.o kernel.ptx ptxas warning : Stack size for entry function 'raytrace_kernel' cannot be statically determined This is due to the recursive trace function called from the kernel (more on that). It can be safely ignored. The nvrtc-generated PTX does not compile and issues the error: $ ptxas -o /tmp/temp_ptxas_output.o nvrtc_kernel.ptx ptxas fatal : Unresolved extern function '_Z5powiffi' Based on this question I added __device__ to the Sphere class constructor and removed the --device-as-default-execution-space compiler option. It generates a slightly different PTX now, but still presents the same error. Compiling with the #include <math.h> line now generates a lot of "A function without execution space annotations is considered a host function, and host functions are not allowed in JIT mode." warnings in addition to the previous errors. If I try to use the accepted solution of that question, it gives me a bunch of syntax errors and does not compile. NVCC still works flawlessly.
Just found the culprit by the ancient comment-and-test method: the error goes away if I remove the pow call used to calculate the fresnel effect inside the trace method. For now, I've just replaced pow(var, 3) with var*var*var. I created an MCVE and filed a bug report with NVIDIA: https://developer.nvidia.com/nvidia_bug/2917596. Liam Zhang answered it and pointed out the problem: The issue in your code is that there is an incorrect option value being passed to cuModuleLoadDataEx. In lines: options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator) values.push_back(&error_buffer_size); the buffer size option is provided, but instead of passing a value with the size, a pointer to that value is passed. Since this pointer is then read as a number, the driver assumed a much larger buffer size than 1024. During the NVRTC compilation an "Unresolved extern function" error occurred, because the pow function signature, as you can find in the documentation, is: __device__ double pow ( double x, double y ) When the driver tried to zero the buffer when putting the error message in it, the segfault happened. Without the call to pow, there was no compilation error, so the error buffer was not used and there was no segfault. To ensure the device code is correct, the values used to call the pow function as well as the output pointer should be a double number, or a float equivalent function, powf, could be used. If I change the call to values.push_back((void*)error_buffer_size); it reports the same error as ptxas compilation of the generated PTX: Compiler error: ptxas fatal : Unresolved extern function '_Z5powiffi' cudaSafeCall() failed at file.cpp:74 : CUDA_ERROR_INVALID_PTX - a PTX JIT compilation failed
CUDA Ray-Sphere intersection random walk spooky values
Results The above results are the |X|Y|Z|AbsDistance of each sphere intersection, random spooky values appear probably because of a newbie mistake, but I really can't get it. To be as specific as I can: The following snippet is supposed to calculate the intersection point between a ray and a spherical boundary with a predefined radius and the origin as the center. To give more context: 1- The RandomWalk starts from the origin and moves with a randomly generated _step and _direction. 2- After each step, the ray is checked for hitting possibility by comparing the absolute distance to the radius of the boundary. 3- getIntersectionPoint() returns the point of intersection, but as the number of points or number of steps increases, the probability of outcasts increases, messing up the whole thing. Here's what I've done: #include <curand.h> #include <curand_kernel.h> #include <iostream> #define N 256 // Number of photons #define THREADS_PER_BLOCK 256 // Threads per Block #define BOUNDARY_RADIUS 5.0 class Point{ private: float _x; float _y; float _z; public: __device__ __host__ Point(float x, float y, float z){ setCoordinates(x, y, z); } __device__ __host__ Point(){ setCoordinates(0.f, 0.f, 0.f); } __device__ __host__ void setCoordinates(float x, float y, float z) { this->_x = x; this->_y = y; this->_z = z; } __device__ __host__ float getX() const { return this->_x; } __device__ __host__ float getY() const { return this->_y; } __device__ __host__ float getZ() const { return this->_z; } __device__ __host__ Point add(Point point){ float result_x = this->_x + point.getX(); float result_y = this->_y + point.getY(); float result_z = this->_z + point.getZ(); return Point( result_x, result_y, result_z ); } __device__ __host__ Point subtract(Point point){ float result_x = this->_x - point.getX(); float result_y = this->_y - point.getY(); float result_z = this->_z - point.getZ(); return Point( result_x, result_y, result_z ); } }; class RNG{ private: __device__ float generate( 
curandState* globalState, int i) { curandState localState = globalState[i]; float random = curand_uniform( &localState ); globalState[i] = localState; return random; } public: __device__ float getRandomStep( curandState* globalState , int i) { float step = 0.f; // Intialize for step value step = generate (globalState, i); return step; } __device__ Point getRandomPoint( curandState* globalState , int i) { float u = generate (globalState , i); float v = generate (globalState, i); float theta = 2 * M_PI * u; float phi = acos(1 - 2 * v); // Transforming into the cartesian space float x = sin(phi) * cos(theta); float y = sin(phi) * sin(theta); float z = cos(phi); return Point(x,y,z); } }; class Ray{ private: Point _prevPos; Point _currentPos; Point _direction; float _step; public: __device__ Ray(Point startingPoint, Point direction){ this->_currentPos.setCoordinates(startingPoint.getX(), startingPoint.getY(), startingPoint.getZ()); this->_direction.setCoordinates(direction.getX(), direction.getY(), direction.getZ()); } __device__ void setDirection(Point direction) { this->_direction.setCoordinates(direction.getX(), direction.getY(), direction.getZ()); } __device__ void setStep(float step) { this->_step = step; } __device__ Point getCurrentPos() const { return this->_currentPos; } __device__ Point getDirection() const { return this->_direction; } __device__ Point getPrevPos() const { return this->_prevPos; } __device__ float getStep() const { return this->_step; } __device__ void move(Point direction, float step) // The point moves in the specified direction with the given step { this->_prevPos = this->_currentPos; this->_direction = direction; this->_step = step; float newX = this->_currentPos.getX() + (direction.getX() * step); float newY = this->_currentPos.getY() + (direction.getY() * step); float newZ = this->_currentPos.getZ() + (direction.getZ() * step); this->_currentPos.setCoordinates(newX, newY, newZ); } }; class Boundary{ private: float _radius; Point _center; 
__device__ float dot(Point point1, Point point2){return point1.getX()*point2.getX() + point1.getY()*point2.getY() + point1.getZ()*point2.getZ();} public: __device__ __host__ Boundary(float r, Point c){ _radius = r; _center = c; } __device__ bool isCrossed(Ray ray){ float absDistance = (float) sqrtf((float) powf(ray.getCurrentPos().getX(),2) + (float) powf(ray.getCurrentPos().getY(),2) + (float) powf(ray.getCurrentPos().getZ(),2)); if(absDistance >= _radius){ return true; } else { return false; } }; __device__ Point getIntersectionPoint(Ray ray){ Point A = ray.getPrevPos(); Point B = ray.getDirection(); Point S = A.add(_center); Point A_C = A.subtract(_center); float a = dot(B, B); float b = 2.0 * dot(B, A_C); float c = dot(A_C, A_C) - _radius*_radius; float discriminant = b*b - 4*a*c; float t1 = (-b + sqrtf(discriminant)) / (2.0*a); float t2 = (-b - sqrtf(discriminant)) / (2.0*a); float t; if(t1 < 0){ t = t2; } else { t = t1; } return Point((A.getX()+B.getX()*t),(A.getY()+B.getY()*t),(A.getZ()+B.getZ()*t)); } }; /** * #brief randomWalk * keeps wandering around with the photon in the 3D space * #return The Point where the Photon hits the Boundary */ __device__ Point randomWalk(curandState_t *states, int idx, Boundary boundary, RNG rng) { Ray ray = Ray(Point(0.f, 0.f, 0.f), Point(0.f, 0.f, 0.f)); while (!boundary.isCrossed(ray)) { ray.move(rng.getRandomPoint(states, idx), rng.getRandomStep(states, idx)); } return boundary.getIntersectionPoint(ray); } void streamOut(Point* _cpuPoints); __global__ void finalPosition(unsigned int seed, curandState_t* states, Point* _gpuPoints,Boundary boundary,RNG rng) { int idx = blockIdx.x*blockDim.x+threadIdx.x; curand_init(seed, idx, 0, &states[idx]); Point finalPos; finalPos = randomWalk(states, idx, boundary, rng); _gpuPoints[idx] = finalPos; } int main() { int nBlocks = N/THREADS_PER_BLOCK + 1; curandState_t* states; cudaMalloc((void**) &states, N * sizeof(curandState_t)); // Allocate host memory for final positions Point * 
_cpuPoints= (Point*)malloc(sizeof(Point) * N); // Allocate device memory for final positions Point* _gpuPoints = nullptr; cudaMalloc((void**) &_gpuPoints, N * sizeof(Point)); // Initializing the Boundary and the Random Number Generator Boundary boundary = Boundary(BOUNDARY_RADIUS, Point(0.f, 0.f, 0.f)); RNG rng; // Call Kernel finalPosition<<<nBlocks,THREADS_PER_BLOCK>>>(time(0), states , _gpuPoints, boundary, rng); // Copy device data to host memory to stream them out cudaMemcpy(_cpuPoints, _gpuPoints, N* sizeof( Point), cudaMemcpyDeviceToHost); streamOut (&_cpuPoints[0]); free(_cpuPoints); cudaFree(_gpuPoints); return 0; } void streamOut(Point* _cpuPoints) { FILE *output; output = fopen("output.csv", "w"); for (int i = 0; i < N; i++) { // Streaming out my output in a log file float absDistance = (float) sqrtf((float) powf(_cpuPoints[i].getX(), 2) + (float) powf(_cpuPoints[i].getY(), 2) + (float) powf(_cpuPoints[i].getZ(), 2)); fprintf(output, "%f,%f,%f,%f\n", _cpuPoints[i].getX(), _cpuPoints[i].getY(), _cpuPoints[i].getZ(), absDistance); } }
Any time you are having trouble with a CUDA code, I recommend using proper CUDA error checking and run your code with cuda-memcheck. When I run your code with cuda-memcheck, I get a variety of errors. This means your kernel code is making illegal, out-of-bounds accesses. You can start to track this down using the method described here. One problem in your code is that you are launching more blocks/threads than what your allocation size N dictates: int nBlocks = N/THREADS_PER_BLOCK + 1; This means some of the threads in your kernel launch will make out-of-bounds accesses. You need to address this with a thread check (if statement) in your kernel code. When I take your code as posted, and modify the kernel like this: __global__ void finalPosition(unsigned int seed, curandState_t* states, Point* _gpuPoints,Boundary boundary,RNG rng, int n) { int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n){ curand_init(seed, idx, 0, &states[idx]); Point finalPos; finalPos = randomWalk(states, idx, boundary, rng); _gpuPoints[idx] = finalPos;} } and the kernel launch like this: finalPosition<<<nBlocks,THREADS_PER_BLOCK>>>(time(0), states , _gpuPoints, boundary, rng, N); I get this result (output.csv): $ cat output.csv 0.628292,-4.899494,0.774730,5.000000 0.162323,-4.930647,-0.813861,5.000000 -1.715985,-0.534316,-4.665823,5.000000 -2.411644,-3.632435,-2.447323,5.000000 -3.418264,-0.851781,3.548231,5.000000 -2.850476,-2.937130,-2.871943,5.000000 0.072410,3.170733,-3.865386,5.000000 1.959057,-0.443189,-4.578829,5.000000 2.031133,-3.467616,-2.974919,5.000000 -2.107327,3.904619,2.305021,5.000000 -4.639953,1.667007,-0.831821,5.000000 3.720370,1.624804,2.918708,5.000000 -1.534095,-3.247724,3.478339,5.000000 -3.888582,0.315719,-3.127179,5.000000 -0.054493,-4.998784,-0.095864,5.000000 3.298623,3.518482,1.318854,5.000000 4.641367,-1.859068,-0.039635,5.000000 -3.671611,-0.072624,3.393228,5.000000 -1.256829,-0.310876,4.829465,5.000000 -2.492307,-4.182354,1.138562,5.000000 
1.395312,-2.987793,3.758483,5.000000 -2.762215,-3.152503,2.726151,5.000000 3.101520,-1.983825,-3.383048,5.000000 -2.169484,3.941614,2.181059,5.000000 -3.971401,-1.138357,-2.816402,5.000000 -2.118435,-1.203381,4.366246,5.000000 3.319744,-3.698802,-0.546043,5.000000 2.737933,3.012805,-2.902883,5.000000 -2.870568,-0.945093,3.983295,5.000000 3.576528,-2.390892,2.547957,5.000000 4.602388,0.700673,1.824028,5.000000 0.122336,4.979045,-0.440617,5.000000 -0.935764,-1.534525,4.665789,5.000000 3.667711,-3.357755,0.522854,5.000000 -1.289282,1.290344,4.655402,5.000000 -3.764930,-3.280344,-0.254253,5.000000 4.267314,-0.811147,2.476302,5.000000 -3.693138,3.297244,-0.699224,5.000000 -1.038960,-2.293650,-4.319691,5.000000 -4.245689,0.974306,2.454558,5.000000 -3.710622,2.254789,2.479358,5.000000 -0.739412,-2.375453,-4.337107,5.000000 -1.122346,0.997810,4.769142,5.000000 4.641891,-1.289307,-1.338109,5.000000 3.943014,-2.680164,-1.506439,5.000000 -1.657783,0.458186,-4.694871,5.000000 2.903168,-3.962222,0.934030,5.000000 1.922109,4.382765,-1.448057,5.000000 2.943883,4.041326,0.035208,5.000000 3.264783,1.974566,-3.231452,5.000000 -3.273946,-2.057536,-3.169830,5.000000 0.055952,-3.367576,3.695442,5.000000 -0.072741,-4.568989,-2.029546,5.000000 -0.157276,4.870314,-1.120405,5.000000 1.299422,0.099700,4.827169,5.000000 2.791323,2.083337,3.587231,5.000000 -0.769589,2.674135,-4.154123,5.000000 -0.424974,2.674058,4.203428,5.000000 -1.297806,1.828922,4.468864,5.000000 1.356144,3.977489,-2.709327,5.000000 -4.020390,1.910192,2.277636,5.000000 0.859541,-4.891906,-0.574843,5.000000 0.760309,4.836938,-1.012894,5.000000 -4.918316,0.898741,-0.049279,5.000000 2.159176,-0.357519,4.495569,5.000000 1.337239,3.632694,-3.164700,5.000000 1.287019,3.640088,3.177001,5.000000 4.175551,-2.552966,1.023299,5.000000 -4.189130,-2.710545,0.322699,5.000000 -3.775866,0.422600,3.250269,5.000000 1.227863,1.939098,-4.442100,5.000000 -0.910808,-0.769251,-4.855789,5.000000 -2.836509,-4.018154,0.899253,5.000000 
-0.943431,-4.248322,-2.462051,5.000000 4.839777,-0.542668,1.132283,5.000000 -0.543598,-4.860043,1.041386,5.000000 2.096293,0.731096,4.480072,5.000000 1.515222,-4.503112,1.557589,5.000000 0.391035,-2.820461,-4.109999,5.000000 -4.697918,1.659874,-0.417592,5.000000 0.731389,4.766176,1.322361,5.000000 -4.971092,0.391872,0.366991,5.000000 4.683945,1.503105,-0.895171,5.000000 0.094646,0.803327,4.934137,5.000000 -4.756599,-1.063119,1.115591,5.000000 -4.741367,0.601832,1.468751,5.000000 -0.622062,-4.399431,-2.293043,5.000000 3.998584,-2.430138,-1.762316,5.000000 2.889354,3.753414,-1.601098,5.000000 4.619578,-1.843518,0.510819,5.000000 -3.468601,-3.576452,-0.421662,5.000000 -2.446475,-3.452250,-2.663969,5.000000 0.611008,4.935348,-0.518658,5.000000 3.356182,3.689818,-0.348258,5.000000 2.723260,-4.022014,-1.186279,5.000000 2.515270,-3.615386,-2.366938,5.000000 1.461690,-4.704300,0.856170,5.000000 -1.220645,3.493145,3.362731,5.000000 -3.669620,3.239435,-1.019782,5.000000 -3.316329,1.182409,3.550193,5.000000 4.916836,-0.796389,0.436450,5.000000 -0.622584,-1.670654,4.671328,5.000000 -1.539724,3.515036,-3.205272,5.000000 -2.272659,3.932659,-2.090267,5.000000 1.659590,0.629000,-4.674411,5.000000 2.067366,-4.000756,2.172545,5.000000 -3.875296,1.900115,-2.524211,5.000000 3.605831,-3.310765,1.018244,5.000000 -0.772092,-4.551371,-1.920650,5.000000 2.968601,-4.023230,-0.032172,5.000000 -1.503622,-3.879141,2.773334,5.000000 -1.722315,-1.940946,-4.273916,5.000000 1.193075,3.128174,3.713637,5.000000 4.582112,1.741668,-0.985310,5.000000 -1.585273,-3.350112,-3.356137,5.000000 3.985136,-2.446971,-1.769469,5.000000 3.462019,2.040801,-2.974820,5.000000 2.336477,1.321345,4.218403,5.000000 -1.968305,4.097646,-2.082083,5.000000 3.373862,1.969776,3.120422,5.000000 -4.997004,0.112979,-0.131096,5.000000 -3.184446,1.498715,-3.551501,5.000000 -4.962571,0.586419,0.170300,5.000000 -2.533729,3.452926,-2.580216,5.000000 -0.292847,4.670508,1.760851,5.000000 4.836363,1.059029,0.698605,5.000000 
2.820885,-4.074259,-0.665598,5.000000 -2.115496,4.106643,1.913154,5.000000 1.624954,3.679764,2.969656,5.000000 4.967940,-0.505954,-0.252150,5.000000 -4.672419,-1.567572,-0.843337,5.000000 3.070334,2.869947,-2.708589,5.000000 2.897243,3.452626,2.164568,5.000000 -3.926629,-1.834329,-2.493356,5.000000 1.167627,-3.817905,3.010024,5.000000 3.711214,2.119984,2.594717,5.000000 2.891797,0.411721,-4.058078,5.000000 -4.938633,-0.489490,0.608526,5.000000 2.090108,3.137994,3.283968,5.000000 -4.941360,0.246557,-0.722615,5.000000 3.025169,3.877938,-0.899971,5.000000 -0.057637,-1.374093,4.807135,5.000000 0.834437,4.757398,-1.292628,5.000000 -2.652762,1.304395,4.032544,5.000000 -2.801193,2.044116,3.602070,5.000000 3.026658,-3.871065,0.924228,5.000000 4.370097,2.405023,0.343677,5.000000 2.026850,-2.908674,-3.525833,5.000000 0.569686,-4.203772,-2.646461,5.000000 -4.491343,-1.708262,1.381909,5.000000 -2.891552,-2.594137,-3.147916,5.000000 4.539808,1.979339,-0.687284,5.000000 3.895631,-2.867161,1.266274,5.000000 -4.846499,-0.577102,-1.085540,5.000000 3.875701,-1.328161,-2.866169,5.000000 -3.538635,-1.043489,3.374788,5.000000 -0.142181,-4.997194,0.088522,5.000000 -1.737878,2.866842,-3.709582,5.000000 2.625108,-2.538683,3.415245,5.000000 -1.590130,-2.351951,4.115800,5.000000 -2.037973,-3.422420,3.022202,5.000000 1.821700,-3.040913,3.526224,5.000000 0.371202,4.985830,-0.060914,5.000000 4.683237,-0.619066,-1.638306,5.000000 2.398519,-4.361123,-0.477192,5.000000 3.776791,1.066108,3.098268,5.000000 -1.047291,2.938777,-3.907271,5.000000 -3.603310,1.989184,2.838891,5.000000 1.356899,3.938558,-2.765246,5.000000 4.458138,1.741543,1.446385,5.000000 -1.534129,2.878480,3.789565,5.000000 0.197157,-1.983445,-4.585529,5.000000 1.337347,4.006208,-2.676154,5.000000 1.458471,-3.795124,2.910309,5.000000 -0.761919,4.939224,0.153434,5.000000 4.058095,2.718427,1.068650,5.000000 -4.893477,-0.106711,1.021024,5.000000 -0.265158,4.754215,1.525495,5.000000 3.515460,3.011975,1.889325,5.000000 
3.462154,-0.533415,3.567767,5.000000 -1.368027,3.607505,3.180316,5.000000 -3.604319,3.357449,0.858149,5.000000 -4.472452,1.961971,1.071376,5.000000 0.718252,-1.359473,-4.757725,5.000000 2.046570,4.513433,-0.663678,5.000000 -0.944693,1.298234,4.735203,5.000000 4.330425,-0.872982,-2.342077,5.000000 -3.978647,2.122892,2.159560,5.000000 0.277452,-0.264253,4.985297,5.000000 4.141907,-1.149088,2.554252,5.000000 -0.233733,-4.282661,2.569860,5.000000 -1.016823,1.102844,-4.769676,5.000000 2.170278,0.935831,4.406145,5.000000 -0.687388,0.576073,-4.918906,5.000000 -2.245424,-0.350784,4.453653,5.000000 1.744440,-2.652875,-3.862536,5.000000 -2.825023,3.965294,-1.138282,5.000000 4.314280,0.748011,-2.414015,5.000000 3.495596,-3.334958,1.287970,5.000000 1.075361,3.787970,3.081377,5.000000 -1.785661,-2.453979,-3.973588,5.000000 2.323098,4.402976,0.465858,5.000000 0.018438,-4.230187,-2.665554,5.000000 4.348341,-1.989394,1.460906,5.000000 -1.429905,4.039118,-2.576994,5.000000 -1.250704,0.451433,-4.819953,5.000000 0.167199,-2.760357,4.165630,5.000000 -2.622747,4.252591,-0.191487,5.000000 -4.118810,1.311523,2.513029,5.000000 0.877228,-4.601123,1.749324,5.000000 2.018730,-2.815201,-3.605464,5.000000 -1.223365,-4.307477,2.224640,5.000000 -1.950934,4.297880,1.649874,5.000000 -0.131930,-0.339385,4.986723,5.000000 1.781163,4.312893,1.796221,5.000000 1.854860,-4.261953,1.842621,5.000000 4.920663,0.883414,-0.081617,5.000000 -3.450862,-3.552723,-0.685352,5.000000 0.141668,-3.981927,-3.020627,5.000000 -0.710307,1.609614,-4.680235,5.000000 -0.913173,2.112117,4.439040,5.000000 3.299812,-3.403087,1.590674,5.000000 -4.896513,0.458815,-0.902026,5.000000 -4.692033,-1.111317,-1.322802,5.000000 1.252288,-1.454753,4.616868,5.000000 -4.081772,-2.074811,-2.008556,5.000000 -1.759481,-0.117314,4.678725,5.000000 4.809183,-0.342558,-1.324544,5.000000 0.667401,2.304548,4.386756,5.000000 0.739023,-3.974324,-2.942548,5.000000 -1.563058,4.086929,-2.419476,5.000000 1.173510,2.061344,4.401560,5.000000 
0.668464,4.852658,-1.002429,5.000000 -4.419933,2.054977,1.114117,5.000000 -1.110164,-4.821075,-0.724410,5.000000 0.445619,0.343362,-4.968252,5.000000 3.654497,2.173541,2.630659,5.000000 -0.369989,0.458368,-4.965179,5.000000 2.885627,0.763048,4.011348,5.000000 -0.180556,4.945003,0.717179,5.000000 0.337894,1.747535,4.672467,5.000000 -1.756681,4.395498,1.610487,5.000000 3.864833,-2.428329,-2.041148,5.000000 -3.238250,2.699637,-2.688065,5.000000 3.534358,-3.506072,0.464514,5.000000 -4.732568,1.448340,0.710711,5.000000 -2.408191,1.378415,-4.159398,5.000000 -4.525957,-1.632861,-1.359957,5.000000 3.973403,2.903845,-0.883036,5.000000 -4.613844,1.607693,1.061963,5.000000 0.118624,-3.210011,3.831678,5.000000 4.347818,2.467214,-0.096607,5.000000 -3.273662,-3.779224,0.024393,5.000000 -1.548274,-1.125134,4.619190,5.000000 2.784302,-1.157474,-3.988473,5.000000 -3.413198,-3.552125,-0.855856,5.000000 which does not appear to have "spooky" values in it. (And the cuda-memcheck errors disappear.) If there are still "spooky" values, then you're going to need to provide a clearer definition of the results you expect.
Slerp interpolation of angle results in -nan(ind)
I'm trying to interpolate a 2D angle and it works 99.9% of the time. For some reason I'm getting -nan(ind) for some values, like: lastAngle = -0.0613451 currentAngle = -0.061421 alpha = 0.218813 This is the code: inline float slerpRotation(const float& angle1, const float& angle2, const float& alpha) { auto v1 = b2Vec2{std::cos(angle1), std::sin(angle1)}; auto v2 = b2Vec2{std::cos(angle2), std::sin(angle2)}; auto v = this->slerp(v1, v2, alpha); return std::atan2(v.y, v.x); } inline b2Vec2 slerp(const b2Vec2& v1, const b2Vec2& v2, const float& alpha) { auto cosAngle = v1.x * v2.x + v1.y * v2.y; auto angle = std::acos(cosAngle); auto angleAlpha = angle * alpha; auto v3 = (v2 - (cosAngle * v1)).Normalize(); auto x = v1.x * std::cos(angleAlpha) + v3 * std::sin(angleAlpha); auto y = v1.y * std::cos(angleAlpha) + v3 * std::sin(angleAlpha); return b2Vec2{x, y}; } All of these examples produce a non-finite result: slerpRotation(-0.0613451f, -0.061421f, 0.218813f); slerpRotation(-1.63139f, -1.63139f, 0.723703f); slerpRotation(-0.0614404f, -0.0614034f, 0.199831f); slerpRotation(0.0194162f, 0.0194164f, 0.259074f); I've been trying to solve this for a while without finding out what causes it. Does anyone know how to fix it?
In the end you are computing angle1+alpha*(angle2-angle1) or, if you want to exclude some fringe cases, angle1+alpha*reduce2pi(angle2-angle1), where reduce2pi(phi) = fmod( 3*pi + fmod(phi, 2*pi), 2*pi)-pi. Note that these formulas are completely singularity-free, as there is no division. It is not necessary to switch back and forth between angles and their points on the unit circle. In code, that would be: inline float slerpRotation(const float& angle1, const float& angle2, const float& alpha) { auto angleDiff = angle2-angle1; angleDiff = std::fmod(angleDiff, 2*std::M_PI); angleDiff = std::fmod(angleDiff + 3*std::M_PI, 2*std::M_PI)-std::M_PI; return angle1+alpha*angleDiff; } (12/13/2016) Combining several comments: if you insist on using exactly this interface structure, then you can get a singularity-free method as follows: inline b2Vec2 slerp(const b2Vec2& v1, const b2Vec2& v2, const float& alpha) { auto angle = std::atan2(v1.x*v2.y - v1.y*v2.x, v1.x*v2.x + v1.y*v2.y); auto angleAlpha = angle * alpha; auto v3=b2Vec2{-v1.y, v1.x}; // rotation by 90° return std::cos(angleAlpha)*v1 + std::sin(angleAlpha)*v3; }
Return my struct as an array in C++, OPENGL
I have a problem with my function. I can't seem to make it return an array of a struct. Here is the MyApp.h header file: struct Vertex { glm::vec3 p; glm::vec3 c; }; class CMyApp { public: CMyApp(void); ~CMyApp(void); Vertex[] DrawCircle(int cx, int cy); ... It underlines the DrawCircle and "expects a ';'". Here is the MyApp.cpp (Of course, header included): Vertex[] CMyApp::DrawCircle(int cx, int cy) { Vertex result[42]; result[0] = { glm::vec3((float)cx, (float)cy, 0.0), glm::normalize(glm::vec3(0, 0, 1)) }; for (int ii = 1; ii < 21; ii++) { float theta = 2.0f * 3.1415926f * float(ii) / float(20); float x = 0.5 * cosf(theta); float y = 0.5 * sinf(theta); result[ii].p = glm::vec3(x, y, 0.0); result[ii].c = glm::normalize(result[ii].p); } result[21] = { glm::vec3((float)cx, (float)cy, 2.0), glm::normalize(glm::vec3(0, 0, 1.0)) }; for (int ii = 22; ii < 42; ii++) { float theta = 2.0f * 3.1415926f * float(ii) / float(20); float x = 0.5 * cosf(theta); float y = 0.5 * sinf(theta); result[ii].p = glm::vec3(x, y, 2.0); result[ii].c = glm::normalize(result[ii].p); } return result; } Same underline here under the function name's DrawCircle for expected ";". If I remove the array marks then the only error is the return statement. I want to return as an array tho. Thanks for help in advance.
You cannot return a local array. Such an array is allocated on the stack; when the function returns, its storage becomes available for reuse by other stack variables. If you use it after the call, its content is likely to be corrupted. So Vertex[] CMyApp::DrawCircle(int cx, int cy) { Vertex result[42]; return result; } is invalid: the compiler rejects the array return type, and using a pointer to result after the function returns would be undefined behavior. You should use a vector instead. Its move constructor makes it efficient for returning many results organized as an array. std::vector<Vertex> CMyApp::DrawCircle(int cx, int cy) { std::vector<Vertex> result; result.reserve(42); // same content than your original code. ... return result; } Note that if you declare class CMyApp { public: CMyApp(void); ~CMyApp(void); typedef Vertex ArrayOfVertices[]; ArrayOfVertices DrawCircle(int cx, int cy); }; you obtain the error message: error: ‘DrawCircle’ declared as function returning an array ArrayOfVertices DrawCircle(int cx, int cy);
"I want to return as an array tho." You can't. C and C++ don't allow it. What you can do in C++ is return a std::vector, which you should use instead of plain arrays anyway.
Multiply a Matrix by a Vector and return a Vector
Ok, I'm hoping someone can help me out here. Technically, I'm trying to multiply a Vector3 by a Matrix4. My FMatrix: struct FMatrix { struct FPlane XPlane; struct FPlane YPlane; struct FPlane ZPlane; struct FPlane WPlane; }; So each FPlane has X, Y, Z, W, making them FMatrix.XPlane.X, FMatrix.XPlane.Y, etc. Here's the code I have so far; I understand that my Vector3 technically needs to be transformed to a Vector4 using 1 or 0 for my W. FVector2D get2dPoint(FVector worldloc) { FMatrix mat = mHUD->ViewProjectionMatrix; FVector res; float x = worldloc.X; float y = worldloc.Y; float z = worldloc.Z; res.X = (mat.XPlane.X*x) + (mat.YPlane.X*y) + (mat.ZPlane.X*z) + (mat.WPlane.X * 1); res.Y = (mat.XPlane.Y*x) + (mat.YPlane.Y*y) + (mat.ZPlane.Y*z) + (mat.WPlane.Y * 1); res.Z = (mat.XPlane.Z*x) + (mat.YPlane.Z*y) + (mat.ZPlane.Z*z) + (mat.WPlane.Z * 1); float winX = round(((res.X + 1.0f) / 2.0f) * mHUD->uiResolution.X); float winY = round(((1.0f - res.Y) / 2.0f) * mHUD->uiResolution.Y); return{ winX, winY }; } As I'm sure most of you can see, I'm trying to make a world-to-screen function. I am using the Boost library and I saw something about uBLAS and matrices, but I definitely got lost in that; if that helps you to help me, I'm cool with it. So either an example with plain C++ or one using Boost functions is fine with me.