I am using Cuda with C++ to do some parallel computing. Recently, I noticed something that I cannot understand and I didn't find informations about it when looking for it. In my code, one line which is very seldom exectued (but needed) slows down the program even when it is not executed at all. Here is some code to make it more clear:
The class I created:
class Foo
{
void myFunction(Foo *listFoo);
//some other functions that I need
...
int myAttribute;
//some other attributes that I need
...
}
The definition of myFunction:
void Foo::myFunction(Foo *listFoo)
{
//do some computations on the listFoo
if( condition seldom verified )
{ myAttribute = myAttribute + 1; }
}
The global function:
__global__ void compute(Foo *listFoo, int numberOfFoo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if( i < numberOfFoo)
{ listFoo[i].myFunction(listFoo); }
}
The host code:
compute<<<(numberOfFoo + 511)/512, 512>>> (listFoo, numberOfFoo)
The line slowing down everything is myAttribute = myAttribute + 1. Even when it is executed 0 times, the code is really slow compared to when the line is put in the comment. I tried to replace this line with a simple printf. The result is the same, the line is never executed but it slows down everything.
If you have any suggestion on the reason and on eventually how to solve this problem, it would be very much appreciated. My level in programing is not so advanced, so please use relatively easy explanations.
Thanks a lot
First Edit: few people requested the code, so here it is! I reduced it to 700 lines, I know it is still very long but not much would work if I keep removing some parts of it. It compiles without problems for me. All you have to do is press enter, wait few seconds and the time taken will be shown in the command window.
It is in the function findContactwithGrain() that the problem occurs. The line addContact(grainContact) is slowing down everything. On my computer, if this line is active, one computation takes around 3.5 sec. If I put it in comment, it takes 0.07 sec. That's a huge difference for one line that is never executed.
Hope this helps to understand the problem
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream> // to read and write files
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <string>
#include <sstream>
#define n 200
using namespace std;
int global_totalNumberBlock = 0;
int global_totalNumberGrain = 0;
//tell the compiler that those classes exist
class Vec3d2;
class Block;
class Grain;
class Contact;
class Analysis;
class Vec3d2
{
public:
__host__ __device__ Vec3d2(void);
__host__ __device__ Vec3d2(double x_value, double y_value, double z_value);
__host__ __device__ ~Vec3d2(void);
__host__ __device__ double dot(Vec3d2 a) const;
__host__ __device__ Vec3d2 cross(Vec3d2 a) const;
__host__ __device__ double norm() const;
__host__ __device__ void normalize();
// to be able to use cout easily
__host__ __device__ friend ostream & operator <<(ostream &s,const Vec3d2 &vec)
{
s << vec.x << endl;
s << vec.y << endl;
s << vec.z << endl;
return s;
}
//to be able to use brackets
__host__ __device__ double operator [](int i) const
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
__host__ __device__ double & operator [](int i)
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
//attributes
double x, y, z;
};
//Class Vec3d2 functions and operators
Vec3d2::Vec3d2()
{
x = 0;
y = 0;
z = 0;
}
Vec3d2::Vec3d2(double x_value, double y_value, double z_value)
{
x = x_value;
y = y_value;
z = z_value;
}
Vec3d2::~Vec3d2()
{
}
double Vec3d2::dot(Vec3d2 a) const
{
return x*a.x + y*a.y + z*a.z;
}
Vec3d2 Vec3d2::cross(Vec3d2 a) const
{
Vec3d2 result( y*a.z - z*a.y, x*a.z - z*a.x, x*a.y - y*a.x);
return result;
}
double Vec3d2::norm() const
{
return sqrt((double) x*x + y*y + z*z);
}
void Vec3d2::normalize()
{
double norm = this->norm();
if (norm > 0)
{
x = x/norm;
y = y/norm;
z = z/norm;
}
else //the vector has a null norm so nothing to do
{
}
}
__host__ __device__ Vec3d2 operator+(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x + b.x, a.y + b.y, a.z + b.z);
}
__host__ __device__ Vec3d2 operator-(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x - b.x, a.y - b.y, a.z - b.z);
}
__host__ __device__ Vec3d2 operator*(Vec3d2 const& a, double const& b)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator*(double const& b, Vec3d2 const& a)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator/(Vec3d2 const& a, double const& b)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ Vec3d2 operator/(double const& b, Vec3d2 const& a)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ bool operator==(Vec3d2 const& a, Vec3d2 const& b)
{
if(a.x == b.x && a.y == b.y && a.z == b.z)
{
return true;
}
else
{
return false;
}
}
__host__ __device__ bool operator!=(Vec3d2 const& a, Vec3d2 const& b)
{
if( a.x != b.x || a.y != b.y || a.z != b.z)
{
return true;
}
else
{
return false;
}
}
class Contact
{
public:
__host__ __device__ Contact(void);
//__host__ __device__ Contact(Contact const& ContactToCopy);
__host__ __device__ ~Contact(void);
__host__ __device__ void setContact(Grain &grain1, Grain &grain2, double overlap_value);
};
class Block
{
public:
__host__ Block(void);
__host__ Block(Block const& BlockToCopy);
__host__ __device__ ~Block(void);
__host__ __device__ Contact* getContactList() const;
__host__ __device__ Contact** getContactListPtr();
__host__ __device__ int getMaxNumberContact() const;
__host__ __device__ int getNumberContact() const;
__host__ __device__ void setContactList(Contact *ptr);
__host__ __device__ void addContact(Contact contact_value);
__host__ __device__ void clearContactList();// empty the contactList
__host__ __device__ void deleteBlockData(); //clear the memory taken by the contactList
__host__ __device__ Block& operator=(Block const& BlockToCopy);
protected:
int Id; //unique Id number for each entity double mass;
int totalNumberBlock; //same value for each block, cannot use static attribute because of cuda
Contact *contactList;
int numberContact, old_numberContact; //because there is no way to find it from the pointer contactList
int maxNumberContact; //maximum number of contact per block, we have to choose this
};
class Grain: public Block
{
public:
__host__ Grain(void);
__host__ Grain(Grain const& grainToCopy);
__host__ Grain(Vec3d2 position_value, double radius_value, double mass_value);
__host__ __device__ ~Grain(void);
__host__ __device__ Vec3d2 getPositionVec() const;
__host__ __device__ Vec3d2* getPosition() const;
__host__ __device__ Vec3d2** getPositionPtr();
__host__ __device__ int getTotalNumberGrain() const;
__host__ void setTotalNumberGrain();
__host__ __device__ void setTotalNumberGrain(int number);
__host__ __device__ void setPosition(Vec3d2 *ptr);
__host__ __device__ void setPositionVec(Vec3d2 position_value);
__host__ __device__ void deleteGrainData();
__host__ __device__ void findContactwithGrain(Grain *grainList);
__host__ __device__ Grain& operator=(Grain const& grainToCopy);
__host__ __device__ friend ostream & operator <<(ostream &s,const Grain &grain)
{
s <<"position is" << endl;
s << *grain.position << endl;
s <<"grain number is" << endl;
s << grain.number << endl;
s <<"radius is" << endl;
s << grain.radius << endl;
s <<"mass is" << endl;
return s;
}
private:
Vec3d2 *position;
int totalNumberGrain;
int number; //different from Id defined in class Block because a wall could have the same number as a grain
double radius;
};
class Analysis
{
public:
Analysis(void);
Analysis(Grain *grainList);
~Analysis(void);
Grain* getGrainList();
void copyToDevice();
void copyToHost();
void runAnalysis();
private:
//should contain grainList, wallList and their equivalent for the device
//should contain an array of pointers for each attribute being a pointer in grain and wall and their equivalent in the device
int totalNumberGrain, totalNumberWall;
Grain *grainList, *d_grainList;
//for grain data
Contact **grain_contactList, **d_grain_contactList;
Vec3d2 **grain_position, **d_grain_position;
};
//class Contact functions
Contact::Contact(void)
{
}
Contact::~Contact(void)
{
}
void Contact::setContact(Grain &grain1, Grain &grain2, double overlap_value)//we are in grain1 and contact with grain2
{
}
//class Block functions
Block::Block(void)
{
Id = global_totalNumberBlock;
numberContact = 0;
old_numberContact = 0;
//contact list settings
maxNumberContact = 30;
contactList = new Contact[maxNumberContact];
//increment of block number
global_totalNumberBlock = global_totalNumberBlock + 1;
}
Block::~Block(void)
{
delete[] contactList;
//cout << "CAREFUL, YOU ARE DESTROYING A BLOCK" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberBlock = totalNumberBlock - 1;
}
Block::Block(Block const& BlockToCopy)
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
Contact* Block::getContactList() const
{
return contactList;
}
Contact** Block::getContactListPtr()
{
return &contactList;
}
int Block::getMaxNumberContact() const
{
return maxNumberContact;
}
int Block::getNumberContact() const
{
return numberContact;
}
void Block::setContactList(Contact *ptr)
{
//no "delete contactList" here because this is executed after cuda. The contactList is pointing to nothing and deleteing it will cause an error
contactList = ptr;
}
void Block::addContact(Contact contact_value)
{
if(numberContact < maxNumberContact)
{
contactList[numberContact] = contact_value;
numberContact = numberContact + 1;
}
else //find a way to throw an error because the list is too small for all the contacts
{
printf("TOO MANY CONTACTS ON ONE GRAIN");
}
}
void Block::clearContactList()
{
//delete[] contactList;
//contactList = new Contact[maxNumberContact];
if(numberContact > 0)
{
numberContact = 0;
}
}
void Block::deleteBlockData()
{
delete[] contactList;
}
__host__ __device__ Block& Block::operator=(Block const& BlockToCopy)
{
if(this != &BlockToCopy) //to check we are not doing a = a
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
delete[] contactList;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
return *this;
}
//class Grain functions
Grain::Grain(void)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
//initialize Vec3d2
position = new Vec3d2;
}
Grain::Grain(Grain const& grainToCopy)
{
cout <<"COPY CONSTRUCTOR OF GRAIN IS NOT DONE YET"<<endl;
system("PAUSE");
//totalNumberGrain = grainToCopy.totalNumberGrain;
//radius = grainToCopy.radius;
//diameter = grainToCopy.diameter;
//volume = grainToCopy.volume;
//inertia = grainToCopy.inertia;
//position = new Vec3d2;
//old_position = new Vec3d2;
//old_velocity = new Vec3d2;
//old_acceleration = new Vec3d2;
//old_angularVelocity = new Vec3d2;
//old_angularAcceleration = new Vec3d2;
//gravityForce = new Vec3d2;
//*position = *grainToCopy.position;
//*old_position = *grainToCopy.old_position;
//*old_velocity = *grainToCopy.old_velocity;
//*old_acceleration = *grainToCopy.old_acceleration;
//*old_angularVelocity = *grainToCopy.old_angularVelocity;
//*old_angularAcceleration = *grainToCopy.old_angularAcceleration;
//*gravityForce = *grainToCopy.gravityForce;
}
Grain::Grain(Vec3d2 position_value, double radius_value,double mass_value)//, number(totalNumberGrain)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
radius = radius_value;
//initialize all the Vec3d2 parameters
position = new Vec3d2;
*position = position_value;
}
Grain::~Grain(void)
{
//cout << "CAREFUL, YOU ARE DESTROYING A GRAIN" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberGrain = totalNumberGrain - 1;
delete position;
}
Vec3d2 Grain::getPositionVec() const
{
return *position;
}
Vec3d2* Grain::getPosition() const
{
return position;
}
Vec3d2** Grain::getPositionPtr()
{
return &position;
}
int Grain::getTotalNumberGrain() const
{
return totalNumberGrain;
}
void Grain::setTotalNumberGrain()
{
totalNumberGrain = global_totalNumberGrain;
}
void Grain::setTotalNumberGrain(int number)
{
totalNumberGrain = number;
}
void Grain::setPosition(Vec3d2 *ptr)
{
position = ptr;
}
void Grain::setPositionVec(Vec3d2 position_value)
{
*position = position_value;
}
void Grain::deleteGrainData()
{
delete position;
}
void Grain::findContactwithGrain(Grain *grainList)
{
for(int m = 0; m < n; m++)
{
double length;
length = (*position - (*grainList[m].position)).norm();
if( length < radius + grainList[m].radius)
{
if( number != grainList[m].number) //faster than number != sortedGrainList[m]
{
Vec3d2 relativePosition = *position - (*grainList[m].position) ;
double overlap = radius + grainList[m].radius - relativePosition.norm();
//define the contact
Contact grainContact;
grainContact.setContact(*this, grainList[m], overlap);
addContact(grainContact); //IF YOU PUT THIS LINE IN COMMENT, EVERYTHING GOES A LOT FASTER
}
}
}
}
__host__ __device__ Grain& Grain::operator=(Grain const& grainToCopy)
{
if(this != &grainToCopy)
{
Block::operator=(grainToCopy); //this lines call the operator = defined for Block. So it copies the block attributes of the first grain into the second grain
//totalNumberGrain = grainToCopy.totalNumberGrain;
radius = grainToCopy.radius;
*position = *grainToCopy.position;
}
return *this;
}
//class Analysis functions
Analysis::Analysis(void)
{
}
Analysis::Analysis(Grain *grainList_value)
{
totalNumberGrain = grainList_value[0].getTotalNumberGrain();
grainList = new Grain[totalNumberGrain];
//copy grains
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i] = grainList_value[i];
grainList[i].setTotalNumberGrain(grainList_value[i].getTotalNumberGrain());
}
}
Analysis::~Analysis(void)
{
delete[] grainList;
//a lot more delete should be made here
}
Grain* Analysis::getGrainList()
{
return grainList;
}
void Analysis::copyToDevice()
{
//declare device grainList and wallList and copy the values
cudaMalloc(&d_grainList, totalNumberGrain*sizeof(Grain));
cudaMemcpy(d_grainList, grainList, totalNumberGrain*sizeof(Grain), cudaMemcpyHostToDevice);
////declare device list of pointer to pass pointer values of grain
d_grain_contactList = new Contact*[totalNumberGrain];
d_grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
cudaMalloc(&d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact));
cudaMalloc(&d_grain_position[i], sizeof(Vec3d2));
}
//copy pointers and values for grains
for(int i = 0; i < totalNumberGrain; i++)
{
//pointers
cudaMemcpy(d_grainList[i].getContactListPtr(), &d_grain_contactList[i], sizeof(Contact*), cudaMemcpyHostToDevice);
cudaMemcpy(d_grainList[i].getPositionPtr(), &d_grain_position[i], sizeof(Vec3d2*), cudaMemcpyHostToDevice);
//values
cudaMemcpy(d_grain_contactList[i], grainList[i].getContactList(), grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyHostToDevice);
cudaMemcpy(d_grain_position[i], grainList[i].getPosition(), sizeof(Vec3d2), cudaMemcpyHostToDevice);
}
}
void Analysis::copyToHost()
{
//delete the pointer value or it will create a memory leak
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i].deleteBlockData();
grainList[i].deleteGrainData();
}
//copy non pointer value
cudaMemcpy(grainList, d_grainList, totalNumberGrain*sizeof(Grain),cudaMemcpyDeviceToHost);
//copy pointer values for grains
grain_contactList = new Contact*[totalNumberGrain];
grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
grain_contactList[i] = new Contact[grainList[i].getMaxNumberContact()];
grain_position[i] = new Vec3d2;
grainList[i].setContactList(grain_contactList[i]);
grainList[i].setPosition(grain_position[i]);
cudaMemcpy(grain_contactList[i], d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyDeviceToHost);
cudaMemcpy(grain_position[i], d_grain_position[i], sizeof(Vec3d2), cudaMemcpyDeviceToHost);
}
}
__global__ void compute( Grain *g)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
//__syncthreads();
if( i < n )
{
g[i].findContactwithGrain(g);
}
}
void Analysis::runAnalysis()
{
for(int i = 0; i < 3; i ++)
{
clock_t begin = clock();
for(int j = 0; j < 10000; j++)
{
compute<<<(n + 511)/512, 512>>>(d_grainList);
}
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
system("PAUSE");
}
}
int main(void)
{
//grain
Vec3d2 position1; position1[0] = 0;position1[1] = 0;position1[2] = 0;
double radius1 = 1;
////cuda
cout << "PRESS ENTER TO START" << endl;
system("PAUSE");
clock_t begin = clock();
Grain *g, *d_g;
g = new Grain[n];
for(int i = 0; i<n; i++)
{
g[i].setTotalNumberGrain();
}
Grain g1(position1, radius1, 0.1);
for(int i = 0; i <n; i++)
{
g[i] = g1;
g[i].setPositionVec(Vec3d2(3*i+1.5, 1.5, 0));
}
Analysis a(g);
a.copyToDevice();
a.runAnalysis();
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
return 0;
}
I would need more code to verify but, the most likely explanation is that when you do not include code you are actually not writing any data to global memory. When you don't write anything to global memory nvcc will optimize just about everything out to the point where you will be running just about nothing.
The same is true when you include a print statement. Print statements are viewed as output, therefore nvcc can't compile out code that it is dependent on.
For example:
__global__ empty_kernel(int* huge_array, int num_elements){
int local_memory;
for(int i=0; i<num_elements; i++){
local_memory+=huge_array[i];
}
}
will run faster than:
__global__ empty_kernel(int* small_array, int num_elements, int* smaller_array){
int tid = ThreadIdx.x+BlockIdx.x*BlockDim.x;
int local_memory;
for(int i=0; i<5; i++){
local_memory+=huge_array[tid*i];
}
smaller_array[tid]=local_memory;
}
The bottom line being, your first kernel isn't faster, it just isn't being run.
The problem in my opinion is simply the if statement, not the statement that it conditionally executes. Conditional branching can be quite expensive on GPU architectures (though it seems to get better with newer architectures), and just having a branching statement could definitely slow down your code.
If you remove the statement within the if clause, the compiler sees that no code is left and therefore can optimize also the if itself away. So this is why you see the speedup when you remove this line of code.
Related
might be a stupid question and if it is, let me know, I will delete it as soon as possible. The thing is I have to make a deep copy in class "Kambarys" (ignore mixed languages, I know I shouldn't do that). Program terminates after trying to call function second time. Probably the problem is my syntax in constructor copy, but I can't find the correct one anywhere. One of the requirements is to create langas, durys and kambarys in dynamic memory using "new" and delete windows vector and door in Kambarys destructor. Appreciate the help!
Requirements:
In the main method, use the new operator to create room k1, add windows and doors to it. Write a constructor Room (const Room & k) that would create a correct copy. In the method main, write another room k2. Calculate the length of the baseboards / wall area.
Perform the following steps: k2 = * k1; delete k1;
#include <iostream>
#include <vector>
#include <iomanip>
using namespace std;
class Langas{
private:
float height;
float widht;
static int countL;
public:
Langas(float h, float w){
this->height=h;
this->widht=w;
countL++;
}
~Langas(){
--countL;
}
float getHeight(){
return height;
}
float getWidht(){
return widht;
}
static int getWindowCount(){
return countL;
}
};
class Durys{
private:
float heightD;
float widhtD;
static int countD;
public:
Durys(float hD, float wD){
this->heightD=hD;
this->widhtD=wD;
countD++;
}
~Durys(){
--countD;
}
float getHeightD(){
return heightD;
}
float getWidhtD(){
return widhtD;
}
static int getDoorCount(){
return countD;
}
};
class Kambarys{
private:
float heightK;
float widhtK;
float lenghtK;
public:
vector<Langas*> windows;
Durys* door;
Kambarys(float hK, float wK, float lK){
this->heightK=hK;
this->widhtK=wK;
this->lenghtK=lK;
}
Kambarys(const Kambarys &k){
this->door=k.door;
this->windows=k.windows;
heightK=k.heightK;
widhtK=k.widhtK;
lenghtK=k.lenghtK;
}
~Kambarys(){
door=NULL;
for(int i=0; i<windows.size(); i++){
delete windows[i];
}
windows.clear();
delete door;
}
float getHeightK(){
return heightK;
}
float getWidhtK(){
return widhtK;
}
float getLenghtK(){
return lenghtK;
}
void addWindow(Langas* w){
windows.push_back(w);
}
void addDoor(Durys *d){
door=d;
}
};
float countWallPlot(Kambarys* k){
float cWPlot=(2*k->getLenghtK()*k->getHeightK())+(2*k->getWidhtK()*k->getHeightK());
for(int i=0; i<k->windows.size(); i++){
cWPlot-=((k->windows[i]->getHeight()))*(k->windows[i]->getWidht());
}
cWPlot-=((k->door->getHeightD()))*(k->door->getWidhtD());
return cWPlot;
}
float countLenght(Kambarys* k){
float floorL=(k->getLenghtK()*k->getWidhtK()*2);
floorL-=(k->door->getWidhtD());
return floorL;
}
int Langas::countL=0;
int Durys::countD=0;
int main(){
Langas *langas1=new Langas(3.4, 1.2);
Durys *durys=new Durys(3.1, 1.5);
Langas *langas2=new Langas(6.4, 1.5);
Kambarys *k=new Kambarys(30.4, 40.1, 50.1);
Kambarys *k2=k;
k->addWindow(langas1);
k->addWindow(langas2);
k->addDoor(durys);
cout<<countWallPlot(k)<<" "<<countLenght(k)<<endl;
cout<<"Window count "<<Langas::getWindowCount()<<", door count "<<Durys::getDoorCount()<<endl;
k2=k;
delete k;
cout<<countWallPlot(k2)<<" "<<countLenght(k2)<<endl;
cout<<"Window count "<<Langas::getWindowCount()<<", door count "<<Durys::getDoorCount()<<endl;
}
You have to allocate memory for k2 and copy the object, not the pointer.
You have to allocate memory in the copy constructor and copy assignment operator.
door=NULL; before delete door; would skip the delete and cause a memory leak.
windows.clear(); is not necessary in the destructor. Keep your code simple.
EDIT: After you added "Perform the following steps: k2 = * k1; delete k1;" I made k2 an object, not a pointer.
#include <iostream>
#include <vector>
class Langas {
private:
float height;
float width;
static int count;
public:
Langas(float h, float w): height(h), width(w) {
++count;
}
~Langas() { --count; }
float getHeight() const { return height; }
float getWidht() const { return width; }
static int getWindowCount() { return count; }
};
class Durys {
private:
float height;
float width;
static int count;
public:
Durys(float h, float w): height(h), width(w) {
++count;
}
~Durys() { --count; }
float getHeight() const { return height; }
float getWidth() const { return width; }
static int getDoorCount() { return count; }
};
class Kambarys {
private:
float height;
float width;
float length;
public:
std::vector<Langas *> windows;
Durys *door = nullptr;
Kambarys(float hK, float wK, float lK): height(hK), width(wK), length(lK) {}
Kambarys(const Kambarys &k): height(k.height), width(k.width), length(k.length), windows(), door(k.door ? new Durys(k.door->getHeight(), k.door->getWidth()) : nullptr) {
for (const auto window : k.windows) {
windows.emplace_back(new Langas(window->getHeight(), window->getWidht()));
}
}
Kambarys &operator=(const Kambarys &k) {
door = k.door ? new Durys(k.door->getHeight(), k.door->getWidth()) : nullptr;
for (const auto window : k.windows) {
windows.emplace_back(new Langas(window->getHeight(), window->getWidht()));
}
height = k.height;
width = k.width;
length = k.length;
return *this;
}
~Kambarys() {
for (auto window : windows) {
delete window;
}
delete door;
}
float getHeight() const { return height; }
float getWidth() const { return width; }
float getLength() const { return length; }
void addWindow(Langas *w) { windows.emplace_back(w); }
void addDoor(Durys *d) { door = d; }
};
float countWallPlot(const Kambarys &k) {
float cWPlot = 2 * k.getLength() * k.getHeight() + 2 * k.getWidth() * k.getHeight();
for (const auto window : k.windows) {
cWPlot -= window->getHeight() * window->getWidht();
}
cWPlot -= k.door->getHeight() * k.door->getWidth();
return cWPlot;
}
float countLength(const Kambarys &k) {
float floor = k.getLength() * k.getWidth() * 2;
floor -= k.door->getWidth();
return floor;
}
int Langas::count = 0;
int Durys::count = 0;
int main() {
Langas *langas1 = new Langas(3.4, 1.2);
Durys *durys = new Durys(3.1, 1.5);
Langas *langas2 = new Langas(6.4, 1.5);
Kambarys *k = new Kambarys(30.4, 40.1, 50.1);
Kambarys k2(*k);
k->addWindow(langas1);
k->addWindow(langas2);
k->addDoor(durys);
std::cout << countWallPlot(*k) << " " << countLength(*k) << std::endl;
k2 = *k;
std::cout << "Window count " << Langas::getWindowCount() << ", door count " << Durys::getDoorCount() << std::endl;
delete k;
std::cout << countWallPlot(k2) << " " << countLength(k2) << std::endl;
std::cout << "Window count " << Langas::getWindowCount() << ", door count " << Durys::getDoorCount() << std::endl;
}
Say I have a simple vector class, vec:
#include <iostream>
#include <stdlib.h>
class vec {
public:
vec() {}
// Constructor.
vec(int n) {
len = n;
data = new double[len];
}
// Destructor.
~vec() { delete [] data; }
// Accessor.
double & operator[](int i) const {
check_index(i);
return data[i];
}
// Other methods...
// ....
protected:
int len;
double * data;
void check_index(int i) const {
if(i < 0 || i >= len) {
std::cerr << "Bad access.\n";
exit(1);
}
}
};
Now suppose I have a special type of vector with sparse structure, e.g., where every even-index is zero. Call this oddvec. Instances of oddvec should be declared just as with the vec class, but underneath, the memory use should be efficient since only half the data is non-zero.
The accessor for the oddvec class should return 0 if the index is even, and return the odd-index element (stored sequentially) otherwise. There a couple problems with this:
The double & return type is violated if the index is even, since the constant value, 0, is returned.
It's not clear to me how to handle the situation when an even index element is used as an lvalue. E.g., v[0] = 3.0 should not be allowed in the oddvec class, but is perfectly acceptable in the vector class. We can't simply throw an error when even indexes are used, because even indexes are fine as long as the intention is as an rvalue.
How do I design the accessor function for the oddvec class, while both keeping the memory storage efficient and inheriting all the methods from the parent?
Non-working example of oddvec:
class oddvec : public vec {
public:
// Constructor.
oddvec(int n) {
len = n;
data = new double[len/2];
}
// Accessor (doesn't work!)
double & operator[](int i) const {
check_index(i);
if (i%2 == 0)
return 0;
else
return data[(i-1)/2];
}
};
Upon compilation:
main.cpp: In member function ‘double& oddvec::operator[](int) const’:
main.cpp:49:20: error: invalid initialization of non-const reference of type ‘double&’ from an rvalue of type ‘double’
return 0;
Working example using proxy classes:
I have implemented a proxy class as suggested in the answer below.
proxies.h
#ifndef PROXIES_H
#define PROXIES_H
#include <iostream>
#include <stdlib.h>
class proxy {
public:
proxy(int i, double v, double * d) {
index = i;
value = v;
data = d;
}
void operator=(double rhs) {
data[index] = rhs;
}
friend std::ostream & operator<<(std::ostream & outs, const proxy & p) {
outs << p.value;
return outs;
}
protected:
int index;
double value;
double * data;
};
class oddproxy : public proxy {
public:
oddproxy(int i, int v, double * d) : proxy(i, v, d) {}
void operator=(double rhs) {
if (index%2 == 0) {
std::cerr << "Even entries of oddvec are not assignable.\n";
exit(1);
}
data[index/2] = rhs;
}
};
#endif
vectors.h
#ifndef VECTORS_H
#define VECTORS_H
#include "proxies.h"
class vec {
public:
vec() {}
// Constructor.
vec(int n) {
len = n;
data = new double[len];
}
// Destructor.
~vec() { delete [] data; }
// Accessor.
proxy operator[](int i) const {
check_index(i);
return proxy(i, data[i], data);
}
inline int length() const { return len; }
// Other methods...
// ....
protected:
int len;
double * data;
void check_index(int i) const {
if(i < 0 || i >= len) {
std::cerr << "Bad access.\n";
exit(1);
}
}
};
class oddvec : public vec {
public:
// Constructor.
oddvec(int n) {
len = n;
data = new double[len/2];
}
// Accessor.
oddproxy operator[](int i) const {
check_index(i);
return oddproxy(i, (i%2 == 0) ? 0 : data[i/2], data);
}
};
#endif
main.cpp
#include <iostream>
#include "vectors.h"
int main () {
int N = 5;
vec V(N);
oddvec O(N);
for(int i=0; i < V.length(); i++) {
V[i] = i;
if(i%2 != 0) {
O[i] = i;
}
}
for(int i=0; i < O.length(); i++) {
std::cout << "V[" << i << "]=" << V[i] << ", "
<< "O[" << i << "]=" << O[i] << "\n";
}
O[0] = 13;
return 0;
}
output
V[0]=0, O[0]=0
V[1]=1, O[1]=1
V[2]=2, O[2]=0
V[3]=3, O[3]=3
V[4]=4, O[4]=0
Even entries of oddvec are not assignable.
You can use proxy object to do this.
simple sample code:
#include <iostream>
#include <vector>
using namespace std;
class very_odd_vector{
public:
class only_odd_proxy;
friend class only_odd_proxy;
only_odd_proxy operator [](int index);
int operator [](int index)const{return index%2==0?0:content[index/2];}
unsigned int size()const{return content.size()*2;}
private:
vector<int> content{1,3,5,7,9};
};
class very_odd_vector::only_odd_proxy{
public:
only_odd_proxy(very_odd_vector& vec,int index):vec(vec),index(index){}
operator int(){return index%2==0 ? 0 : vec.content[index/2];}
only_odd_proxy& operator =(int value){
if(index%2==0)
cout << "BAD OPERATION";//any error you want
else
vec.content[index/2] = value;
return *this;
}
private:
very_odd_vector& vec;
int index;
};
auto very_odd_vector::operator [](int index)->only_odd_proxy{return only_odd_proxy(*this,index);}
int main(){
very_odd_vector v;
cout << "reading value\n";
for(int i=0;i<v.size();++i)
cout << v[i] <<'\n';
cout << "writting value\n";
for(int i=0;i<v.size();++i){
cout << i << ':';
v[i]=10;
cout << '\n';
}
cout << "reading value\n";
for(int i=0;i<v.size();++i)
cout << v[i] <<'\n';
}
Edit for updated part of question :
I think this class will fit your need more.
//Both base and inherit class return this class
class maybe_readonly_proxy {
public:
maybe_readonly_proxy(double* data, bool readonly):readonly(readonly),data(data){}
maybe_readonly_proxy& operator=(double rhs) {
if(readonly){/*whatever error*/}
else {*data = rhs;}
return *this;
}
operator double()const{return *data;}
private:
bool readonly;
double * data;
};
You may need a variable to contain readonly (0 in this case) value, or modify the operator double() the check readonly state
Or just implement get and set method separately and do not use this proxy may be another choice.
So I am testing out some object arrays in C++, and I am trying to delete these objects afterwards, as I am supposed to.
But here's the problem: the deleteInputPattern variable works fine, so I am able to fully delete "inputs" within the CSVFile header class, but its equivalent in the main file, "inputArray", triggers a breakpoint.
What is the problem here? Am I trying to delete non-existent memory? Do any of the pointers need
Code wall below:
InputTest.h:
#pragma once
class InputTest
{
private:
float r;
float g;
float b;
float t;
public:
InputTest();
~InputTest();
InputTest(float r, float g, float b, float t);
void setR(float newT);
float getR();
void setG(float newT);
float getG();
void setB(float newT);
float getB();
void setT(float newT);
float getT();
void print(int count);
};
InputTest.cpp:
#include "InputTest.h"
#include <stdio.h>
InputTest::InputTest()
{
printf("Input constructor\n");
}
InputTest::~InputTest()
{
printf("Input destructor\n");
}
InputTest::InputTest(float r, float g, float b, float t)
{
this->r = r;
this->g = g;
this->b = b;
this->t = t;
}
void InputTest::setR(float newT)
{
r = newT;
}
float InputTest::getR()
{
return r;
}
void InputTest::setG(float newT)
{
g = newT;
}
float InputTest::getG()
{
return g;
}
void InputTest::setB(float newT)
{
b = newT;
}
float InputTest::getB()
{
return b;
}
void InputTest::setT(float newT)
{
t = newT;
}
float InputTest::getT()
{
return t;
}
void InputTest::print(int count)
{
printf("R: %.2f\n", r);
printf("G: %.2f\n", g);
printf("B: %.2f\n", b);
printf("T: %.2f\n", t);
}
Copy.h:
#pragma once
class InputTest;
class Copy
{
private:
int patternCount;
InputTest** inputs;
public:
Copy();
~Copy();
InputTest* getInputPattern(int index);
void addInputPattern(InputTest* in);
void deleteInputPattern();
};
Copy.cpp:
#include "Copy.h"
#include "InputTest.h"
#include <string.h>
#include <stdio.h>
Copy::Copy()
{
printf("CSV File constructor\n");
inputs = NULL;
patternCount = 0;
inputs = new InputTest*[3];
int i;
for (i = 0; i < 3; i++)
{
inputs[i] = new InputTest();
}
}
Copy::~Copy()
{
printf("CSV File destructor\n");
}
InputTest * Copy::getInputPattern(int index)
{
printf("input gotten: %d\n", index);
return inputs[index];
}
void Copy::addInputPattern(InputTest * in)
{
inputs[patternCount] = in;
patternCount++;
printf("input added: %d\n", patternCount);
}
void Copy::deleteInputPattern()
{
int i;
for (i = 0; i < patternCount; i++)
{
delete inputs[i];
}
delete inputs;
inputs = NULL;
}
main.cpp:
#include "Copy.h"
#include "InputTest.h"
#include <string.h>
#include <stdio.h>
int main(int argv, char** argc)
{
bool testResult = false;
Copy *test = NULL;
test = new Copy();
InputTest **inputArray;
inputArray = new InputTest*[3];
int count;
for (count = 0; count < 3; count++)
{
inputArray[count] = new InputTest();
inputArray[count]->setR(0.2f);
inputArray[count]->setG(0.6f);
inputArray[count]->setB(0.8f);
inputArray[count]->setT(0.5f);
test->addInputPattern(inputArray[count]);
inputArray[count] = test->getInputPattern(count);
printf("next\n");
}
for (count = 0; count < 3; count++)
{
printf("round %d\n", count);
printf("R: %f\n", inputArray[count]->getR());
printf("G: %f\n", inputArray[count]->getG());
printf("B: %f\n", inputArray[count]->getB());
printf("T: %f\n", inputArray[count]->getT());
}
test->deleteInputPattern();
for (count = 0; count < 3; count++)
{
delete inputArray[count];
}
delete inputArray;
delete test;
inputArray = NULL;
test = NULL;
return testResult;
}
These seem to be the problematic line:
test->deleteInputPattern();
for (count = 0; count < 3; count++)
{
delete inputArray[count];
}
Since you have already deleted using test->deleteInputPattern(), that memory is freed. Now you are deleting the same objects (to which you are still holding a reference via inputArray) explicitly in main using delete inputArray. But that memory is already deleted in deleteInputPattern and hence you should be getting a memory access error.
You need to free any allocated memory only once. There is no need to do it again in main(). Either call deleteInputPattern or call delete explicitly in main, but not both. I can recommend 2 best practices:
Use smart pointers
The allocating module should delete the memory (this may not be applicable in many situations though such as factories)
my following minimalist Cuda code returns an incorrect result (all polygons have 0 vertices at the end) while the same code running in serial in C++ is working well. The problem is embarrassingly parallel : no communication, no syncthreads etc., and the Cuda memory allocations are sucessful. Even my dummy variable that stores the content of the input array for debug purpose is 0 for the Cuda version. There is no access out of bounds since my arrays are largely large enough. Replacing the memcpy by a loop in Cuda doesn't change anything.
I really don't understand what happens... any idea ? Thanks!
Cuda code:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
class Point2D {
public:
__device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
double x, y;
};
__device__ double dot(const Point2D &A, const Point2D &B) {
return A.x*B.x + A.y*B.y;
}
__device__ Point2D operator*(double a, const Point2D &P) {
return Point2D(a*P.x, a*P.y);
}
__device__ Point2D operator+(Point2D A, const Point2D &B) {
return Point2D(A.x + B.x, A.y + B.y);
}
__device__ Point2D operator-(Point2D A, const Point2D &B) {
return Point2D(A.x - B.x, A.y - B.y);
}
__device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
Point2D M = 0.5*(C+D);
return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}
class Polygon {
public:
__device__ Polygon():nbpts(0){};
__device__ void addPts(Point2D pt) {
pts[nbpts] = pt;
nbpts++;
};
__device__ Polygon& operator=(const Polygon& rhs) {
nbpts = rhs.nbpts;
dummy = rhs.dummy;
memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
return *this;
}
__device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
int new_nbpts = 0;
Point2D newpts[128];
Point2D AB(outside_pt-inside_pt);
Point2D M(0.5*(outside_pt+inside_pt));
double ABM = dot(AB, M);
Point2D S = pts[nbpts-1];
for (int i=0; i<nbpts; i++) {
Point2D E = pts[i];
double ddot = -ABM + dot(AB, E);
if (ddot<0) { // E inside clip edge
double ddot2 = -ABM + dot(AB, S);
if (ddot2>0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
newpts[new_nbpts] = E;
new_nbpts++;
} else {
double ddot2 = -ABM + dot(AB, S);
if (ddot2<0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
}
S = E;
}
memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
nbpts = new_nbpts;
}
//private:
Point2D pts[128];
int nbpts;
float dummy;
};
__global__ void cut_poly(float *a, Polygon* polygons, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];
polygons[idx] = pol;
}
int main(int argc, unsigned char* argv[])
{
const int N = 100;
float a_h[N], *a_d;
Polygon p_h[N/2], *p_d;
size_t size = N * sizeof(float);
size_t size_pol = N/2 * sizeof(Polygon);
cudaError_t err = cudaMalloc((void **) &a_d, size);
cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol);
for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);
for (int i=0; i<N/2; i++)
printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);
cudaFree(a_d);
cudaFree(p_d);
return 0;
}
Same code in C++ that works properly:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
class Point2D {
public:
Point2D(double xx=0, double yy=0):x(xx),y(yy){};
double x, y;
};
double dot(const Point2D &A, const Point2D &B) {
return A.x*B.x + A.y*B.y;
}
Point2D operator*(double a, const Point2D &P) {
return Point2D(a*P.x, a*P.y);
}
Point2D operator+(Point2D A, const Point2D &B) {
return Point2D(A.x + B.x, A.y + B.y);
}
Point2D operator-(Point2D A, const Point2D &B) {
return Point2D(A.x - B.x, A.y - B.y);
}
Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
Point2D M = 0.5*(C+D);
return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}
class Polygon {
public:
Polygon():nbpts(0){};
void addPts(Point2D pt) {
pts[nbpts] = pt;
nbpts++;
};
Polygon& operator=(const Polygon& rhs) {
nbpts = rhs.nbpts;
dummy = rhs.dummy;
memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
return *this;
}
void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
int new_nbpts = 0;
Point2D newpts[128];
Point2D AB(outside_pt-inside_pt);
Point2D M(0.5*(outside_pt+inside_pt));
double ABM = dot(AB, M);
Point2D S = pts[nbpts-1];
for (int i=0; i<nbpts; i++) {
Point2D E = pts[i];
double ddot = -ABM + dot(AB, E);
if (ddot<0) { // E inside clip edge
double ddot2 = -ABM + dot(AB, S);
if (ddot2>0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
newpts[new_nbpts] = E;
new_nbpts++;
} else {
double ddot2 = -ABM + dot(AB, S);
if (ddot2<0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
}
S = E;
}
memcpy(pts, newpts, std::min(128, new_nbpts)*sizeof(Point2D));
/*for (int i=0; i<128; i++) {
pts[i] = newpts[i];
}*/
nbpts = new_nbpts;
}
//private:
Point2D pts[128];
int nbpts;
float dummy;
};
void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
if (idx==i) continue;
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];
polygons[idx] = pol;
}
int main(int argc, unsigned char* argv[])
{
const int N = 100; // Number of elements in arrays
float a_h[N], *a_d; // Pointer to host & device arrays
Polygon p_h[N/2], *p_d;
for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
for (int idx=0; idx<N; idx++)
cut_poly(idx, a_h, p_h, N);
for (int i=0; i<N/2; i++)
printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);
return 0;
}
Well I guess you can disregard most of my comments. I was by mistake working on a machine I had set up with CUDA 3.2 and it was behaving differently along the lines of the kernel launch failure. When I switched to CUDA 4.1 and CUDA 5.0 things started to make sense. Apologies for my confusion there.
Anyway after getting past that, I pretty quickly noticed that there is a difference between your CPU and GPU implementations. Specifically here (looking at the CPU code):
void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
if (idx==i) continue; /* NOTE THIS LINE MISSING FROM YOUR GPU CODE */
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];
polygons[idx] = pol;
}
Referring to the line I have added the comment to above, if you add that exact line of code to the corresponding spot in your GPU code in the cut_poly kernel, then for me anyway the GPU code produces the same printed result as the CPU code.
One other observation I would make is that you are needlessly running blocks with just 4 threads. Nothing wrong with that while you're working out the kinks in the code, but once you have it running for "production" purposes, you will most likely want to target a higher number like 256, and be sure to choose a number that is an integer multiple of 32, for best performance.
In response to a question posted in the comments, I believe that the data is being copied properly, but most likely you are not accessing it correctly on the host. (I don't know how you're determining that "my array is not properly returned to the host"). Most of your class definitions were __device__ only. As a result, it's difficult to access structures within classes on the host (e.g. the Point2D pts class within the Polygon class). I'm inserting modified code here which I think demonstrates that the data is being transferred back to the host:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
// #include <cuda.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
class Point2D {
public:
__host__ __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
double x, y;
};
__host__ __device__ double dot(const Point2D &A, const Point2D &B) {
return A.x*B.x + A.y*B.y;
}
__host__ __device__ Point2D operator*(double a, const Point2D &P) {
return Point2D(a*P.x, a*P.y);
}
__host__ __device__ Point2D operator+(Point2D A, const Point2D &B) {
return Point2D(A.x + B.x, A.y + B.y);
}
__host__ __device__ Point2D operator-(Point2D A, const Point2D &B) {
return Point2D(A.x - B.x, A.y - B.y);
}
__host__ __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
Point2D M = 0.5*(C+D);
return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}
class Polygon {
public:
__host__ __device__ Polygon():nbpts(0){};
__host__ __device__ void addPts(Point2D pt) {
pts[nbpts] = pt;
nbpts++;
};
__host__ __device__ Polygon& operator=(const Polygon& rhs) {
nbpts = rhs.nbpts;
dummy = rhs.dummy;
memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
return *this;
}
__host__ __device__ Point2D getpoint(unsigned i){
if (i<128) return pts[i];
else return pts[0];
}
__host__ __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
int new_nbpts = 0;
Point2D newpts[128];
Point2D AB(outside_pt-inside_pt);
Point2D M(0.5*(outside_pt+inside_pt));
double ABM = dot(AB, M);
Point2D S = pts[nbpts-1];
for (int i=0; i<nbpts; i++) {
Point2D E = pts[i];
double ddot = -ABM + dot(AB, E);
if (ddot<0) { // E inside clip edge
double ddot2 = -ABM + dot(AB, S);
if (ddot2>0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
newpts[new_nbpts] = E;
new_nbpts++;
} else {
double ddot2 = -ABM + dot(AB, S);
if (ddot2<0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
}
S = E;
}
memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
nbpts = new_nbpts;
}
//private:
Point2D pts[128];
int nbpts;
float dummy;
};
__global__ void cut_poly(float *a, Polygon* polygons, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
if (idx==i) continue;
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = pol.getpoint(0).x;
polygons[idx] = pol;
}
int main(int argc, unsigned char* argv[])
{
const int N = 100;
float a_h[N], *a_d;
Polygon p_h[N/2], *p_d;
size_t size = N * sizeof(float);
size_t size_pol = N/2 * sizeof(Polygon);
cudaMalloc((void **) &a_d, size);
cudaCheckErrors("cm1");
cudaMalloc((void **) &p_d, size_pol);
cudaCheckErrors("cm2");
for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaCheckErrors("cmcp1");
int block_size = 128;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
cudaCheckErrors("kernel");
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaCheckErrors("cmcp2");
cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);
cudaCheckErrors("cmcp3");
for (int i=0; i<N/2; i++)
printf("%f \t %f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].getpoint(0).x, p_h[i].nbpts);
cudaFree(a_d);
cudaFree(p_d);
return 0;
}
I would suggest using posting new questions for these things.
I'm trying to play with some code..keep getting an compile error RND not declared in scope I found a part of the code that defined it if it ran on linux and if defined it on windows thus ignoring Mac users(no biggie, I would ignore them too!). I removed that part of the code and defined it using the linux settings(since I figured my Mac is more closer to linux than windows), but then I get the same error but for seed. The odd thing is those seed errors are at the same spot at the RND error was. So my question is what the heck is RND/Seed? My searches found them specific to VB but not sure if its useful since I'm using C++.
Here's an offensive code snipped(viewers discretion is advised):
mi = (int)(round(RND*(dimc-1)));
Any tips/suggestions would be great. I'm just starting to learn about c++ so I maybe missing something very simple.
Here's the entire code(stole it from here http://cg.iit.bme.hu/~zsolnai/gfx/genetic/ ):
// a fast genetic algorithm for the 0-1 knapsack problem
// by karoly zsolnai - keeroy#cs.bme.hu
// test case: 1000 items, 50 knapsack size
//
// compilation by: g++ genetic.cpp -O3 -ffast-math -fopenmp
#include <math.h>
#include <time.h>
#include <algorithm>
#include <vector>
#include <fstream>
#include <limits.h>
#define RND ((double)rand_r(&seed)/RAND_MAX) // reentrant uniform rnd
using namespace std;
struct chromo {
chromo(int dimc) { items = new bool[dimc]; }
~chromo() { items = NULL; }
void mutate(const int dimc, const int count) {
int mi;
for(int i=0;i<count;i++) {
mi = (int)(round(RND*(dimc-1)));
items[mi] = !items[mi];
}
}
bool* items;
int f;
};
int fitness(bool*& x, const int dimc, const vector<int>& v, const vector<int>& w, const int limit) {
int fit = 0, wsum = 0;
for(int i=0;i<dimc;i++) {
wsum += x[i]*w[i];
fit += x[i]*v[i];
}
if(wsum>limit) fit -= 7*(wsum-limit); // penalty for invalid solutions
return fit;
}
void crossover1p(const chromo& c1, const chromo& c2, const chromo& c3, const int dimc, const int cp) {
for(int i=0;i<dimc;i++) {
if(i<cp) { c3.items[i] = c1.items[i]; }
else { c3.items[i] = c2.items[i]; }
}
}
void crossover1p_b(const chromo &c1, const chromo &c2, const chromo &c3, int dimc, int cp) {
for(int i=0;i<dimc;i++) {
if(i>=cp) { c3.items[i] = c1.items[i]; }
else { c3.items[i] = c2.items[i]; }
}
}
void crossoverrand(const chromo &c1, const chromo &c2, const chromo &c3, const int dimc) {
for(int i=0;i<dimc;i++) {
if(round(RND)) { c3.items[i] = c1.items[i]; }
else { c3.items[i] = c2.items[i]; }
}
}
void crossoverarit(const chromo &c1, const chromo &c2, const chromo &c3, int dimc) {
for(int i=0;i<dimc;i++) {
c3.items[i] = (c1.items[i]^c2.items[i]);
}
}
bool cfit(const chromo &c1,const chromo &c2) { return c1.f > c2.f; }
bool cmpfun(const std::pair<int,double> &r1, const std::pair<int,double> &r2) { return r1.second > r2.second; }
int coin(const double crp) { // a cointoss
if(RND<crp) return 1; // crossover
else return 0; // mutation
}
// initializes the chromosomes with the results of a greedy algorithm
void initpopg(bool**& c, const std::vector<int> &w, const std::vector<int> &v, const int dimw, const int limit, const int pop) {
std::vector<std::pair<int,double> > rvals(dimw);
std::vector<int> index(dimw,0);
for(int i=0;i<dimw;i++) {
rvals.push_back(std::pair<int,double>(std::make_pair(i,(double)v[i]/(double)w[i])));
}
std::sort(rvals.begin(),rvals.end(),cmpfun);
int currentw = 0, k;
for(int i=0;i<dimw;i++) {
k = rvals[i].first;
if(currentw + w[k] <= limit) { // greedy fill
currentw += w[k];
index[k] = 1;
}
}
for(int i=0;i<pop;i++) {
for(int j=0;j<dimw;j++) {
c[i][j] = index[j];
}
}
}
int main() {
printf("\n");
srand(time(NULL));
vector<int> w, v; // items weights and values
int info=0;
FILE *f = fopen("1000_weights.txt","r");
FILE *f2 = fopen("1000_values.txt","r");
while(!feof(f) || !feof(f2) ) {
fscanf(f," %d ",&info);
w.push_back(info);
info=0;
fscanf(f2," %d ",&info);
v.push_back(info);
} // omitted fclose(f1) and fclose(f2) on purpose
const int limit = 50; // knapsack weight limit
const int pop = 250; // chromosome population size
const int gens = INT_MAX; // maximum number of generations
const int disc = (int)(ceil(pop*0.8)); // chromosomes discarded via elitism
const int dimw = w.size();
int best = 0, ind = 0, ind2 = 0; // a few helpers for the main()
int parc = 0; // parent index for crossover
double avg = 0, crp = 0.35; // crossover probability
vector<chromo> ch(pop,chromo(dimw));
bool **c = new bool*[pop];
for(int i=0;i<pop;i++) c[i] = new bool[dimw];
clock_t start = clock();
printf("Initializing population with a greedy algorithm...");
initpopg(c,w,v,dimw,limit,pop);
printf("done!");
for(int i=0;i<pop;i++) {
ch[i].items = c[i];
ch[i].f = fitness(ch[i].items, dimw ,v, w, limit);
}
printf("\n\n");
for(int p=0;p<gens;p++) {
std::sort(ch.begin(), ch.end(), cfit);
#pragma omp parallel for shared(ch)
for(int i=0;i<pop;i++) {
if(i>pop-disc) { // elitism - only processes the discarded chromosomes
if(coin(crp)==1) { // crossover section
ind = parc+round(10*RND); // choosing parents for crossover
ind2 = parc+1+round(10*RND);
// choose a crossover strategy here
crossover1p(ch[ind%pop],ch[ind2%pop],ch[i],dimw,round(RND*(dimw-1)));
// crossoverrand(ch[ind],ch[ind2],ch[i],dimw);
// crossoverarit(ch[0],ch[1],ch[i],dimw);
ch[i].f = fitness(ch[i].items, dimw ,v, w, limit);
parc += 1;
}
else { // mutation section
ch[i].mutate(dimw,1);
ch[i].f = fitness(ch[i].items, dimw ,v, w, limit);
}
}
avg += ch[i].f;
if(ch[i].f>best) best=ch[i].f;
}
parc = 0;
if(p%5==0) {
printf("\n#%d\t",p);
printf("best fitness: %d \t",best);
printf("avg fitness: %f",avg/pop);
if(best == 675) goto end; // psst...don't tell anyone
}
best = avg = 0;
}
end:
printf("\n\n");
clock_t end = clock();
double t = (double)(end-start)/CLOCKS_PER_SEC;
printf("\nCompletion time: %fs.\n",t);
return 0;
}
The problem is you've inexpertly cut apart the code you've got:
#if defined(__linux) || defined(__linux__)
unsigned int seed = time(NULL);
#define RND ((double)rand_r(&seed)/RAND_MAX) // reentrant uniform rnd
#endif
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
#define RND ((double)rand()/RAND_MAX) // uniform rnd
#endif
This defines the seed variable based on the current time for Linux systems; perhaps the Windows systems do not need a seed?
In any event, if you include both lines from the if defined (__linux) ... branch, instead of only one line, it should work without trouble on your OS X system.