I have a large array (image) and I need to do many small configurable computations on that data.
I'll post an example here.
NOTE: This is not the actual problem, but a minimal / hopefully illustrative example of what I need to do.
// different functions that can be called based on the configuration
float func1( float* a )
{
return (*a) * (*a);
}
float func2( float* a )
{
return (*a) + (*a);
}
float func3( float* a )
{
return 2 * (*a) * (*a);
}
// my data
float* data = new float[1024*1024];
// function that manages the configurations
int main( )
{
int param1 = 0;
int param2 = 1;
for ( int k = 0; k < 1024*1024; k++ )
{
if ( param1 == 2 && param2 == 0 )
data[k] = func1( data + k );
else if ( param1 == 1 && param2 == 1 )
data[k] = func2( data + k );
else if ( param1 == 0 && param2 == 1 )
data[k] = func3( data + k );
else
continue;
}
}
In my code, it does not make sense to put the loop inside each function.
However, param1 and param2 remain constant during the loop and they are known at compile time.
Is there a way to remove the per-iteration cost of the if/else if checks?
You can move the if-else statement that selects the appropriate function out of the loop, which gives you:
#include <functional>
// different functions that can be called based on the configuration
float func1( float* a )
{
return (*a) * (*a);
}
float func2( float* a )
{
return (*a) + (*a);
}
float func3( float* a )
{
return 2 * (*a) * (*a);
}
// my data
float* data = new float[1024*1024];
// function that manages the configurations
int main( )
{
int param1 = 0;
int param2 = 1;
std::function< float( float* )> functionToUse = nullptr;
if ( param1 == 2 && param2 == 0 )
functionToUse = std::function<float(float*)>(func1);
else if ( param1 == 1 && param2 == 1 )
functionToUse = std::function<float(float*)>(func2);
else if ( param1 == 0 && param2 == 1 )
functionToUse = std::function<float(float*)>(func3);
if(functionToUse){
for ( int k = 0; k < 1024*1024; k++ )
{
data[k] = functionToUse( data + k );
}
}
}
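If the std::function wrapper ever shows up as call overhead, a plain function pointer works just as well here; a minimal sketch of the same idea:
float (*functionToUse)( float* ) = nullptr;
if ( param1 == 2 && param2 == 0 )
    functionToUse = func1;
else if ( param1 == 1 && param2 == 1 )
    functionToUse = func2;
else if ( param1 == 0 && param2 == 1 )
    functionToUse = func3;
if ( functionToUse )
{
    for ( int k = 0; k < 1024*1024; k++ )
        data[k] = functionToUse( data + k );
}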
As for choosing the function to use at compile time, I'd suggest checking out this question:
if/else at compile time?
Also this question might be of interest:
Is cutting if statements by using function pointers going to be more efficient?
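To illustrate the compile-time option from the first link: with C++17's if constexpr the selection can be resolved entirely at compile time (only a sketch, assuming the funcN functions from the question and that param1/param2 are passed as template arguments):
template <int param1, int param2>
void processAll( float* data, int n )
{
    for ( int k = 0; k < n; k++ )
    {
        if constexpr ( param1 == 2 && param2 == 0 )
            data[k] = func1( data + k );
        else if constexpr ( param1 == 1 && param2 == 1 )
            data[k] = func2( data + k );
        else if constexpr ( param1 == 0 && param2 == 1 )
            data[k] = func3( data + k );
        // any other combination leaves data[k] untouched
    }
}
// usage: processAll<0, 1>( data, 1024 * 1024 );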
As long as the parameters are const, or the compiler can determine with certainty that the variables aren't aliased and thus won't change (harder for the compiler), I would fully expect the optimizer to remove the runtime branch and do all the selection work at compile time.
If, however, you prefer not to rely on the optimizer, you can use templates:
template <int c1, int c2>
float func(float* a)
{
// Fallback for parameter combinations without a specialization:
// leave the value unchanged (the original loop simply skipped these).
return *a;
}
template <>
float func<2, 0>(float* a)
{
return (*a) * (*a);
}
template <>
float func<1, 1>(float* a)
{
return (*a) + (*a);
}
template <>
float func<0, 1>(float* a)
{
return 2 * (*a) * (*a);
}
int main()
{
const int param1 = 0;
const int param2 = 1;
for ( int k = 0; k < 1024*1024; k++ )
{
data[k] = func<param1, param2>( data + k );
}
}
Maybe something like this
#include <iostream>
#include <map>
#include <functional>
#include <utility>
typedef std::pair<size_t, size_t> pair;
typedef std::map< pair, std::function<float( float* )>> map;
// different functions that can be called based on the configuration
float func1( float* a )
{
return ( *a ) * ( *a );
}
float func2( float* a )
{
return ( *a ) + ( *a );
}
float func3( float* a )
{
return 2 * ( *a ) * ( *a );
}
// my data
float* data = new float[1024 * 1024];
void init( map &myMap )
{
myMap.insert( std::make_pair( pair( 2, 0 ), std::function<float( float* )>( func1 ) ) );
myMap.insert( std::make_pair( pair( 1, 1 ), std::function<float( float* )>( func2 ) ) );
myMap.insert( std::make_pair( pair( 0, 1 ), std::function<float( float* )>( func3 ) ) );
}
// function that manages the configurations
int main( )
{
int param1 = 0;
int param2 = 1;
map myMap;
init( myMap );
for( int k = 0; k < 1024 * 1024; k++ )
{
data[k] = myMap[pair( param1, param2 )]( data + k );
}
}
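Note that this does one map lookup per element; hoisting the lookup out of the loop (and using find() so that a missing key does not silently insert an empty std::function) avoids that cost. A sketch:
map::const_iterator it = myMap.find( pair( param1, param2 ) );
if ( it != myMap.end( ) )
{
    const std::function<float( float* )> &f = it->second;
    for ( int k = 0; k < 1024 * 1024; k++ )
        data[k] = f( data + k );
}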
I have two functions:
The add_cpu function works fine, but the add_gpu function does not.
I tried checking some options in my GPU driver software and read my code over and over again. I tried the exact same code on another machine, and there it worked fine.
The checkError result on the current machine is 1, which it shouldn't be.
The checkError result on my laptop is 0, which is correct.
Does anyone have a suggestion as to what the problem with the graphics card or the system might be?
I have no clue what the problem is here.
Did I miss some sort of option?
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <math.h>
#define out std::cout <<
#define end << std::endl
__global__
void add_gpu( int n, float* x, float* y ) {
for ( int i = 0; i < n; i++ ) y[i] = x[i] + y[i];
}
void add_cpu( int n, float* x, float* y ) {
for ( int i = 0; i < n; i++ ) y[i] = x[i] + y[i];
}
void init( int n, float* x, float* y ) {
for ( int i = 0; i < n; i++ ) {
x[i] = 1.0f;
y[i] = 2.0f;
}
}
int checkError( int n, float f, float* y ) {
float c = 0.0f;
for ( int i = 0; i < n; i++ ) c = fmax( c, fabs( y[i] - f ) );
return c;
}
void print( int n, float* obj, const char* str = "obj: " ) {
out str << obj[0];
for ( int i = 1; i < n; i++ ) out ", " << obj[i];
out "" end;
}
int main( ) {
int n = 1 << 5;
float* x, * y;
float error = 0.0f;
cudaMallocManaged( &x, n * sizeof( float ) );
cudaMallocManaged( &y, n * sizeof( float ) );
init( n, x, y );
print( n, x, "x" );
print( n, y, "y" );
add_gpu<<<1, 1>>>( n, x, y );
//add_cpu(n, x, y);
cudaDeviceSynchronize( );
print( n, y, "y" );
error = checkError( n, 3.0f, y );
out "error: " << error end;
cudaFree( x );
cudaFree( y );
return 0;
}
I don't see exactly where the problem is, but in order to debug it you should check the CUDA errors.
Most CUDA functions return a CUDA status. You could use a little wrapper function like this to check the errors:
void checkCudaError(const cudaError_t error) {
if (error != cudaSuccess) {
std::cout << "Cuda error: " << cudaGetErrorString(error) << std::endl;
// maybe do something else
}
}
and call functions like cudaMallocManaged() this way:
checkCudaError(cudaMallocManaged(&x, n * sizeof(float)));
For all operations performed on the device (such as custom kernels), you should run the kernel and afterwards call
cudaGetLastError()
and maybe also use checkCudaError()
checkCudaError(cudaGetLastError())
Note that cudaGetLastError() reports an error that occurred at any earlier point, so you have to find the place where the first error occurs. That is why you should check for CUDA errors every time the GPU is used in some way.
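Put together, the kernel launch in the code above could be checked roughly like this (a sketch using the wrapper from above):
add_gpu<<<1, 1>>>( n, x, y );
checkCudaError( cudaGetLastError( ) );      // catches launch / configuration errors
checkCudaError( cudaDeviceSynchronize( ) ); // catches errors raised while the kernel runs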
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gc263dbe6574220cc776b45438fc351e8
Without copying the data to the device, your GPU doesn't know about the data, and without copying it back, your host doesn't know about the results.
I want to compute a general matrix matrix product
C = C + alpha*A*B
Now, the special situation in my context is that I know the number of columns of A (resp. the number of rows of B) at compile time. Therefore, I can hard-code this information, which naturally yields a significant performance boost.
Up to now, I have done this in Fortran, which works, but requires me to re-write the same code over and over again for all possible scenarios. Therefore, I would like to write one C++ version of this function, in which I pass the size information at compile time.
After a bit of trying, I finally arrived at the following solution
template<size_t sizeA2>
void matrixMatrixProduct( double* __restrict__ pC,
double const* __restrict__ pA,
double const* __restrict__ pB,
double alpha,
size_t sizeA1,
size_t sizeB2 )
{
size_t outerLoopLimit = sizeA1;
size_t innerLoopLimit = sizeB2;
size_t sizeC2 = sizeB2;
for ( size_t i = 0; i < outerLoopLimit; ++i )
{
#pragma vector aligned
#pragma ivdep
for ( size_t j = 0; j < innerLoopLimit; ++j )
{
#pragma vector aligned
#pragma ivdep
for ( size_t k = 0; k < sizeA2; ++k )
{
pC[i * sizeC2 + j] += alpha * pA[i * sizeA2 + k] * pB[k * sizeB2 + j];
} // end of k-loop
} // end of j-loop
} // end of i-loop
}
Adding the restrict keyword and the two pragmas, I could convince the Intel C++ compiler to vectorize the j-loop, so that I got a maximum speed of about 19 GFlops on a single core of an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz, which is in perfect agreement with the Fortran counterpart.
What I now want to do is extend the functionality, such that I can transpose the three matrices A, B, and C individually. To this end, my plan was to add a template access policy which flips the two indices if necessary. I extended the code as follows
template<size_t sizeA2,
typename AccessOperatorA = NoTranspose,
typename AccessOperatorB = NoTranspose,
typename AccessOperatorC = NoTranspose >
void matrixMatrixProduct( double* __restrict__ pC,
double const* __restrict__ pA,
double const* __restrict__ pB,
double alpha,
size_t sizeA1,
size_t sizeB2 )
{
size_t outerLoopLimit = sizeA1;
size_t innerLoopLimit = sizeB2;
size_t sizeC2 = sizeB2;
for ( size_t i = 0; i < outerLoopLimit; ++i )
{
#pragma vector aligned
#pragma ivdep
for ( size_t j = 0; j < innerLoopLimit; ++j )
{
#pragma vector aligned
#pragma ivdep
for ( size_t k = 0; k < sizeA2; ++k )
{
AccessOperatorC::get( pC, sizeC2, i, j ) += alpha * AccessOperatorA::get( pA, sizeA2, i, k ) * AccessOperatorB::get( pB, sizeB2, k, j );
} // end of k-loop
} // end of j-loop
} // end of i-loop
}
with the NoTranspose access operator being defined as
struct NoTranspose
{
template<typename DataType>
static inline DataType& get( DataType* __restrict__ pointer,
size_t size2,
size_t i,
size_t j )
{
return pointer[i * size2 + j];
}
};
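For completeness, the transposed counterpart of this policy simply flips the two indices (only a sketch; I leave aside here how the leading dimension of the stored matrix is passed in for the transposed case):
struct Transpose
{
    template<typename DataType>
    static inline DataType& get( DataType* __restrict__ pointer,
                                 size_t size2,
                                 size_t i,
                                 size_t j )
    {
        // size2 is the leading dimension of the stored (untransposed) matrix
        return pointer[j * size2 + i];
    }
};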
In principle, the extended code compiles and gives the correct answers, but only at about 60% of the speed!
From what I understand, the problem seems to be that the alignment information is not carried over into the get function of the access policy, even though get is inlined according to the perf report. As a result, icpc seems to decide to work on xmm registers only, which causes the performance penalty.
Therefore, my question is: how can I get the compiler to correctly vectorize the extended code?
Any help is gratefully acknowledged.
For completeness, here is the complete MWE:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <chrono>
#include <cmath>
struct NoTranspose
{
template<typename DataType>
static inline DataType& get( DataType* __restrict__ pointer,
size_t size2,
size_t i,
size_t j )
{
return pointer[i * size2 + j];
}
};
template<size_t sizeA2,
typename AccessOperatorA = NoTranspose,
typename AccessOperatorB = NoTranspose,
typename AccessOperatorC = NoTranspose >
void matrixMatrixProduct( double* __restrict__ pC,
double const* __restrict__ pA,
double const* __restrict__ pB,
double alpha,
size_t sizeA1,
size_t sizeB2 )
{
size_t outerLoopLimit = sizeA1;
size_t innerLoopLimit = sizeB2;
size_t sizeC2 = sizeB2;
for ( size_t i = 0; i < outerLoopLimit; ++i )
{
#pragma vector aligned
#pragma ivdep
for ( size_t j = 0; j < innerLoopLimit; ++j )
{
#pragma vector aligned
#pragma ivdep
for ( size_t k = 0; k < sizeA2; ++k )
{
AccessOperatorC::get( pC, sizeC2, i, j ) += alpha * AccessOperatorA::get( pA, sizeA2, i, k ) * AccessOperatorB::get( pB, sizeB2, k, j );
} // end of k-loop
} // end of j-loop
} // end of i-loop
}
int main( void )
{
std::vector<int> sizesA1 = { 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, 500, 600, 700, 800, 900, 1000 };
printf( "%15s \t %15s \t%15s \t %15s\n", "sizeA1", "sizeA2", "Time [s]", "GFlops" );
for ( const auto & sizeA1 : sizesA1 )
{
int numberOfIterations = 1e6;
const int sizeA2 = 2;
size_t sizeB2 = sizeA1;
size_t sizeC1 = sizeA1;
size_t sizeC2 = sizeB2;
size_t lengthA = sizeA1 * sizeA2;
size_t lengthB = sizeA2 * sizeB2;
int lengthC = sizeC1 * sizeC2;
std::vector<double> A1( lengthA, 1.234 );
std::vector<double> B1( lengthB, 1.234 );
std::vector<double> C1( lengthC, 1.234 );
std::vector<double> A2( lengthA, 1.234 );
std::vector<double> B2( lengthB, 1.234 );
std::vector<double> C2( lengthC, 1.234 );
double alpha = 1.234;
auto start = std::chrono::high_resolution_clock::now( );
for ( int i = 0; i < numberOfIterations; ++i )
{
double* pA;
double* pB;
double* pC;
//Force cache reload
if ( i % 2 )
{
pA = A1.data( );
pB = B1.data( );
pC = C1.data( );
}
else
{
pA = A2.data( );
pB = B2.data( );
pC = C2.data( );
}
matrixMatrixProduct<sizeA2>( pC, pA, pB, alpha, sizeA1, sizeB2 );
}
auto end = std::chrono::high_resolution_clock::now( );
std::chrono::duration<double> elapsed = end - start;
double numberOfFlops = 1.0 * numberOfIterations * lengthC * ( 3 + 2 ); //two adds and three mult!
double flops = (double) numberOfFlops / ( elapsed.count( ) );
printf( "%15d \t %15d \t %15g \t %15e\n", sizeA1, sizeA2, elapsed.count( ), flops / ( 1.0e9 ) );
}
return 0;
}
I'm trying to measure how much time each of three solutions to a problem takes, but sometimes I get a runtime error and can't see the elapsed time for the 3rd solution; other times it works. I think the solutions.h file is correct, but I put it here anyway.
#include <iostream>
#include <cstdlib>
#include <ctime>
#include "solutions.h"
using namespace std;
int main()
{
cout << "Hello world!" << endl;
int* input1 = new int[10000];
int* input2 = new int[20000];
int* input3 = new int[40000];
int* input4 = new int[80000];
int* input5 = new int[100000];
for(int i = 0; i<100000; i++)
{
input1[i]= rand();
input2[i]= rand();
input3[i]= rand();
input4[i]= rand();
input5[i]= rand();
}
int* output1= new int[1000];
double duration;
clock_t startTime1 = clock();
solution1(input1,10000,1000,output1);
duration = 1000 * double( clock() - startTime1 ) / CLOCKS_PER_SEC;
cout << "Solution 1 with 10000 inputs took " << duration << " milliseconds." << endl;
startTime1 = clock();
solution2(input1,10000,1000,output1);
duration = 1000 * double( clock() - startTime1 ) / CLOCKS_PER_SEC;
cout << "Solution 2 with 10000 inputs took " << duration<< " milliseconds." << endl;
startTime1 = clock();
solution3(input1,10000,1000,output1);
duration = 1000 * double( clock() - startTime1 ) / CLOCKS_PER_SEC;
cout << "Solution 3 with 10000 inputs took " << duration << " milliseconds." << endl<<endl<<endl;
return 0;
}
And the solutions.h is
#ifndef SOLUTIONS_H_INCLUDED
#define SOLUTIONS_H_INCLUDED
#include <cmath>
void solution1( int input[], const int n, const int k, int output[] );
void solution2( int input[], const int n, const int k, int output[] );
void solution3( int input[], const int n, const int k, int output[] );
void swap( int &n1, int &n2 ) {
int temp = n1;
n1 = n2;
n2 = temp;
}
void solution1( int input[], const int n, const int k, int output[] ) {
int maxIndex, maxValue;
for( int i = 0; i < k; i++ ) {
maxIndex = i;
maxValue = input[i];
for( int j = i+1; j < n; j++ ) {
if( input[j] >= maxValue ) {
maxIndex = j;
maxValue = input[ j ];
}
}
swap( input[i], input[maxIndex] );
output[i] = input[i];
}
}
int partition( int input[], int p, int r ) {
int x = input[ r ], i = p - 1;
for( int j = p; j < r; j++ ) {
if( input[ j ] >= x ) {
i = i + 1;
swap( input[i], input[j] );
}
}
swap( input[i+1], input[r] );
return i + 1;
}
void quickSort( int input[], int p, int r ) {
int q;
if( p < r ) {
q = partition( input, p, r );
quickSort( input, p, q - 1 );
quickSort( input, q + 1, r );
}
}
void solution2( int input[], const int n, const int k, int output[] ) {
quickSort( input, 0, n - 1 );
for( int i = 0; i < k; i++ ) {
output[i] = input[i];
}
}
int partition2( int input[], int a, int p, int r ) {
int x = a, i = p - 1;
for( int j = p; j < r; j++ ) {
if( input[ j ] == x ) {
swap( input[ j ], input[ r ] );
}
if( input[ j ] >= x ) {
i = i + 1;
swap( input[i], input[j] );
}
}
swap( input[ i + 1 ], input[ r ] );
return i + 1;
}
void quickSort2( int input[], int p, int r ) {
int q;
if( p < r ) {
q = partition2( input, input[ r ], p, r );
quickSort2( input, p, q - 1 );
quickSort2( input, q + 1, r );
}
}
int findMin( int n1, int n2 ) {
if( n1 <= n2 )
return n1;
else
return n2;
}
int select( int input[], int n, int k, int start, int end, int flag ) {
if( n <= 5 ) {
quickSort2( input, start, end );
return input[ start + k - 1 ];
}
int i = start, numGroups = (int) ceil( ( double ) n / 5 ), numElements, j = 0;
int *medians = new int[numGroups];
while( i <= end ) {
numElements = findMin( 5, end - i + 1 );
medians[( i - start ) / 5] = select( input, numElements, (int) ceil( ( double ) numElements / 2 ), i, i + numElements - 1, 1 );
i = i + 5;
}
int M = select( medians, numGroups, (int) ceil( ( double ) numGroups / 2 ), 0, numGroups - 1, 1 );
delete[] medians;
if( flag == 1 )
return M;
int q = partition2( input, M, start, end );
int m = q - start + 1;
if( k == m )
return M;
else if( k < m )
return select( input, m - 1, k, start, q - 1, 0 );
else
return select( input, end - q, k - m, q + 1, end, 0 );
}
void solution3( int input[], const int n, const int k, int output[] ) {
select( input, n, k, 0, n - 1, 0 );
for( int i = 0; i < k; i++ )
output[i] = input[i];
}
#endif // SOLUTIONS_H_INCLUDED
Building your program with address sanitizer (clang++ clock.cxx -std=c++11 -O1 -g -fsanitize=address -fno-omit-frame-pointer) reveals the problem:
$ ./a.out
Hello world!
=================================================================
==8175==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62e00000a040 at pc 0x000104dbd912 bp 0x7fff5ae43970 sp 0x7fff5ae43968
WRITE of size 4 at 0x62e00000a040 thread T0
#0 0x104dbd911 in main clock.cxx:18
#1 0x7fff88cd85fc in start (libdyld.dylib+0x35fc)
#2 0x0 (<unknown module>)
0x62e00000a040 is located 0 bytes to the right of 40000-byte region [0x62e000000400,0x62e00000a040)
And here is your code:
int* input1 = new int[10000];
int* input2 = new int[20000];
int* input3 = new int[40000];
int* input4 = new int[80000];
int* input5 = new int[100000];
for(int i = 0; i<100000; i++)
{
input1[i]= rand();
input2[i]= rand();
input3[i]= rand();
input4[i]= rand();
input5[i]= rand();
}
As you can see, input1 through input4 hold 10K, 20K, 40K, and 80K elements respectively, but the loop writes to indices up to 100000 in all of them, so the accesses go past the end of these arrays and corrupt the heap.
Process returned -1073741819 (0xC0000005)
This means "memory access violation" or SEGFAULT.
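A minimal fix is to fill each buffer only up to its own length, for example (just a sketch, using the sizes from your allocations):
for ( int i = 0; i < 10000; i++ )  input1[i] = rand();
for ( int i = 0; i < 20000; i++ )  input2[i] = rand();
for ( int i = 0; i < 40000; i++ )  input3[i] = rand();
for ( int i = 0; i < 80000; i++ )  input4[i] = rand();
for ( int i = 0; i < 100000; i++ ) input5[i] = rand();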
Hope this will help.
I have a template class for spatial indexing which by default should work for any 2D object that implements the function void boundingBox( Rect2d * box ), using std::vector<OBJECT*> as the container for objects inserted into a particular grid tile.
template <class OBJECT, class TILE = std::vector<OBJECT*> >
class GridMap2D {
public:
double step, invStep;
int nx, ny, nxy;
TILE * tiles;
// ==== functions
inline int getIx( double x ){ return (int)( invStep * x ); };
inline int getIy( double y ){ return (int)( invStep * y ); };
inline double getX( int ix ){ return step * ix ; };
inline double getY( int iy ){ return step * iy ; };
inline int getIndex ( int ix, int iy ){ return nx*iy + ix; };
inline int getIndex ( double x, double y ){ return getIndex( getIx(x), getIy(y) ); };
inline TILE* getTile( double x, double y ){ return tiles + getIndex( x, y ); };
inline void insert( OBJECT* p, int i ){
tiles[ i ].push_back( p );
}
inline void insert( OBJECT* p, int ix, int iy ){ insert( p, getIndex( ix,iy ) ); };
// This is a very general method to insert any object with a bounding box,
// but for many objects it is not very efficient.
// Some objects such as Point2d do not even implement boundingBox().
inline void insert( OBJECT* p ){
Rect2d bbox;
p->boundingBox( &bbox );
int ix0 = getIx( bbox.x0 ); // TODO: bound check ?
int iy0 = getIy( bbox.y0 );
int ix1 = getIx( bbox.x1 );
int iy1 = getIy( bbox.y1 );
for( int iy=iy0; iy<=iy1; iy++ ){
for( int ix=ix0; ix<=ix1; ix++ ){
insert( p, ix, iy );
}
}
}
void init( int nx_, int ny_, double step_, int tile_n0 ){
step = step_;
invStep = 1/step;
nx = nx_; ny=ny_;
nxy = nx*ny;
tiles = new TILE[ nxy ];
for (int i=0; i<nxy; i++){
if ( tile_n0 != 0 ){
tiles[i].reserve( tile_n0 );
}
}
}
};
And here is its specialization for Segment2d, which does not implement boundingBox() but has its own insert algorithm based on line rasterization:
template<> class GridMap2D< Segment2d, std::vector<Segment2d*> >{
public:
inline void insert( Segment2d* l ){
Vec2d* a = l->a;
Vec2d* b = l->b;
double ax = a->x;
double ay = a->y;
double bx = b->x;
double by = b->y;
double dx = fabs( bx - ax );
double dy = fabs( by - ay );
int dix = ( ax < bx ) ? 1 : -1;
int diy = ( ay < by ) ? 1 : -1;
int ix = getIx( ax );
int iy = getIy( ay );
int ixb = getIx( bx );
int iyb = getIy( by );
double x=0, y=0;
int i=0;
insert( l, ix, iy );
insert( l, ixb, iyb );
while ( ( ix != ixb ) && ( iy != iyb ) ) {
if ( x < y ) {
x += dy;
ix += dix;
} else {
y += dx;
iy += diy;
}
insert( l, ix, iy );
}
}
};
which will insert the line into the grid tiles through which it passes ... like this:
but I have several problems with how templates work:
In the specialization for Segment2d I get the error: ‘getIx’ was not declared in this scope. Does that mean the specialized template does not know about the functions defined in the base template? Or am I doing the specialization wrongly? I really do not want to rewrite the code several times; then the template approach would be pointless.
I'm not sure what happens when I instantiate or specialize the template with a parameter that does not implement some of the methods the base template uses, e.g.
I use a different container type as the TILE argument which does not implement .push_back()
my Segment2d does not implement boundingBox()
Can specialization solve this problem?
background:
I have two goals:
I want to create very fast spatial indexing for acceleration of ray-casting and collisions for any 2d shape.
Because it should be as fast as possible, I want to use templates (resolved at compile time) rather than a class inheritance hierarchy with virtual methods.
I want to learn how to use templates effectively
This question is related to this " Generic Quadtree " question, where the recommendation is to use templates for a similar task. I tried to implement that ... but perhaps my understanding of templates is not good enough.
NOTE: my GridMap2d is not a QuadTree, but I still added QuadTree as a keyword, because the question is relevant to it. A QuadTree is a very common spatial indexing data structure, and implementing it with templates would run into the same issue.
"I really do not want to rewrite the code several times, then the template approach would be pointless."
When you specialize a class, you do not "inherit" any of the member fields or methods. So you need some special tricks here to get what you want.
What you can do instead is to essentially move the behavior of your insert() method into a separate template class. That way, when you specialize that class's behavior, you don't end up clobbering your other fields and methods. This requires some clever restructuring of your code.
This code I believe should do the job:
struct GridMap2D_base {
double step, invStep;
int nx, ny, nxy;
int getIx( double x ) const { return (int)( invStep * x ); };
int getIy( double y ) const { return (int)( invStep * y ); };
double getX( int ix ) const { return step * ix ; };
double getY( int iy ) const { return step * iy ; };
int getIndex ( int ix, int iy ) const { return nx*iy + ix; };
int getIndex ( double x, double y ) const { return getIndex( getIx(x), getIy(y) ); };
};
struct PushBackHelper {
// add versions of this for std::list, etc., as needed
template <typename OBJECT>
static void push_back(std::vector<OBJECT*>& tile, OBJECT* p) {
tile.push_back(p);
}
};
template<typename OBJECT>
struct InsertAlgorithm {
int ix0, iy0, ix1, iy1, ix, iy;
InsertAlgorithm(const GridMap2D_base& base, OBJECT* p) {
Rect2d bbox;
p->boundingBox( &bbox );
ix0 = base.getIx( bbox.x0 ); // TODO: bound check ?
iy0 = base.getIy( bbox.y0 );
ix1 = base.getIx( bbox.x1 );
iy1 = base.getIy( bbox.y1 );
ix = ix0;  // start the sweep at the lower corner of the bounding box
iy = iy0;
}
bool should_preinsert1(const GridMap2D_base& base, int& ix2, int& iy2) { return false; }
bool should_preinsert2(const GridMap2D_base& base, int& ix2, int& iy2) { return false; }
bool should_insert(const GridMap2D_base& base, int& ix2, int& iy2)
{
while (ix<=ix1) {
ix2 = ix;
iy2 = iy;
ix++;
return true;
}
iy++;
if (iy>iy1) return false;
ix = ix0;
return should_insert(base, ix2, iy2);
}
};
template<> struct InsertAlgorithm<Segment2d> {
Vec2d* a;
Vec2d* b;
double ax, ay, bx, by, dx, dy, x, y;
int dix, diy, ix, iy, ixb, iyb;
InsertAlgorithm(const GridMap2D_base& base, Segment2d* l) {
a = l->a;
b = l->b;
ax = a->x;
ay = a->y;
bx = b->x;
by = b->y;
x = 0;
y = 0;
dx = fabs( bx - ax );
dy = fabs( by - ay );
dix = ( ax < bx ) ? 1 : -1;
diy = ( ay < by ) ? 1 : -1;
ix = base.getIx( ax );
iy = base.getIy( ay );
ixb = base.getIx( bx );
iyb = base.getIy( by );
}
bool should_preinsert1(const GridMap2D_base& base, int& ix2, int& iy2) {
ix2 = ix;
iy2 = iy;
return true;
}
bool should_preinsert2(const GridMap2D_base& base, int& ix2, int& iy2) {
ix2 = ixb;
iy2 = iyb;
return true;
}
bool should_insert(const GridMap2D_base& base, int& ix2, int& iy2)
{
if (ix==ixb && iy==iyb) return false;
if ( x < y ) {
x += dy;
ix += dix;
} else {
y += dx;
iy += diy;
}
ix2 = ix;
iy2 = iy;
return true;
}
};
template<class OBJECT, typename TILE=std::vector<OBJECT*> >
class GridMap2D : public GridMap2D_base {
public:
TILE* tiles;
TILE* getTile( double x, double y ){ return tiles + getIndex( x, y ); };
void insert( OBJECT* p ){
InsertAlgorithm<OBJECT> algo(*this, p);
int ix = 0;
int iy = 0;
if (algo.should_preinsert1(*this, ix, iy)) {
PushBackHelper::push_back(tiles[getIndex(ix, iy)], p);
}
if (algo.should_preinsert2(*this, ix, iy)) {
PushBackHelper::push_back(tiles[getIndex(ix, iy)], p);
}
while (algo.should_insert(*this, ix, iy)) {
PushBackHelper::push_back(tiles[getIndex(ix, iy)], p);
}
}
void init( int nx_, int ny_, double step_, int tile_n0 ){ ... }
};
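Usage then looks the same for every object type (a sketch, assuming the Segment2d, Vec2d and Rect2d types from your question):
GridMap2D<Segment2d> grid;
grid.init( 128, 128, 0.1, 16 );   // 128 x 128 tiles, tile size 0.1, reserve 16 slots per tile
Segment2d segment;                // fill in segment.a / segment.b as usual
grid.insert( &segment );          // picks InsertAlgorithm<Segment2d> at compile time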
By the way, the inline keyword has no effect when used within the class declaration; member functions defined inside the class body are implicitly inline anyway.
I am attempting to create a Worley Noise function. I looked around and read the original paper by Steven Worley and wrote my implementation. The output does not appear to be correct though. I expect to see something like this.
But it outputs this.
I'm not sure what I am doing wrong. I have looked online at other people's programs and I do it the same way. Here is the code.
float WorleyNoise::noise( Vector3 input ) {
unsigned int lastRandom;
unsigned int numberFeaturePoints;
Vector3 randomDiff;
Vector3 featurePoint;
int cubeX;
int cubeY;
int cubeZ;
float distanceArray[ 3 ];
for ( int i = 0; i < 3; i++ ) {
distanceArray[ i ] = 6666.0f;
}
int evalX = ( int ) floorf( input.m_x );
int evalY = ( int ) floorf( input.m_y );
int evalZ = ( int ) floorf( input.m_z );
for ( int i = -1; i < 2; ++i ) {
for ( int j = -1; j < 2; ++j ) {
for ( int k = -1; k < 2; ++k ) {
cubeX = evalX + i;
cubeY = evalY + j;
cubeZ = evalZ + k;
lastRandom = lcg_random( hash( ( unsigned int ) ( cubeX + m_seed ), ( unsigned int ) cubeY, ( unsigned int ) cubeZ ) );
numberFeaturePoints = lookup( lastRandom );
for ( unsigned int l = 0; l < numberFeaturePoints; ++l ) {
lastRandom = lcg_random( lastRandom );
randomDiff.m_x = ( float ) lastRandom / 0x100000000;
lastRandom = lcg_random( lastRandom );
randomDiff.m_y = ( float ) lastRandom / 0x100000000;
lastRandom = lcg_random( lastRandom );
randomDiff.m_z = ( float ) lastRandom / 0x100000000;
featurePoint.m_x = randomDiff.m_x + ( float ) cubeX;
featurePoint.m_y = randomDiff.m_y + ( float ) cubeY;
featurePoint.m_z = randomDiff.m_z + ( float ) cubeZ;
insert( distanceArray, euclidian_distance( input, featurePoint ) );
}
}
}
}
return Utility::clampf( combine_function_1( distanceArray ), 0.0f, 1.0f );
}
unsigned int WorleyNoise::hash( unsigned int i, unsigned int j, unsigned int k ) {
return ( unsigned int ) ( ( ( ( ( ( OFFSET_BASIS ^ ( unsigned int ) i ) * FNV_PRIME ) ^ ( unsigned int ) j ) * FNV_PRIME ) ^ ( unsigned int ) k ) * FNV_PRIME );
}
unsigned int WorleyNoise::lcg_random( unsigned int last ) {
return ( unsigned int ) ( ( 1103515245 * last + 12345 ) % 0x100000000 );
}
void WorleyNoise::insert( float arr[] , float value ) {
float temp = 0.0f;
for ( int i = 2; i >= 0; i-- ) {
if ( value > arr[ i ] ) {
break;
}
temp = arr[ i ];
arr[ i ] = value;
if ( i + 1 < 3 ) {
arr[ i + 1 ] = temp;
}
}
}
unsigned int WorleyNoise::lookup( unsigned int value ) {
value = value & 0xffffffff;
if ( value < 393325350 )
return 1;
if ( value < 1022645910 )
return 2;
if ( value < 1861739990 )
return 3;
if ( value < 2700834071 )
return 4;
if ( value < 3372109335 )
return 5;
if ( value < 3819626178 )
return 6;
if ( value < 4075350088 )
return 7;
if ( value < 4203212043 )
return 8;
return 9;
}
float WorleyNoise::euclidian_distance( Vector3 p1, Vector3 p2 ) {
return ( p1.m_x - p2.m_x ) * ( p1.m_x - p2.m_x ) + ( p1.m_y - p2.m_y ) * ( p1.m_y - p2.m_y ) + ( p1.m_z - p2.m_z ) * ( p1.m_z - p2.m_z );
}
float WorleyNoise::combine_function_1( float arr[] ) {
return arr[ 0 ];
}
I then output the results like so with pngwriter.
void WorleyNoise::to_png( const std::string &file, unsigned int width, unsigned int height ) {
pngwriter png( width, height, 0, file.c_str() );
for ( unsigned int i = 0; i < width; i++ ) {
for ( unsigned int j = 0; j < height; j++ ) {
float value = noise( Vector3( i, j, 0 ) );
png.plot( i, j, ( double ) value, ( double ) value, ( double ) value );
}
}
png.close();
}
So what is going wrong here? I've looked all over, especially at this tutorial and cannot figure out why it does not output correctly. Any help is greatly appreciated, thanks.