C++ Deep Copy Object - c++

I am trying to deep copy objects back and forth. When I run the gdb, I get the following error after one iteration of the loop.
Program received signal SIGSEGV, Segmentation fault.
0x0804ab96 in DGCPM::DGCPM (this=0x844b760, cur=0x1) at DGCPM.C:27
27 memcpy(vRCells, cur->vRCells,sizeof(float)*nThetaCells);
I suspect the problem has to do with creating the "new class," but I'm not sure. Any suggestions?
(Note: The "_initialize" code calls a FORTRAN subroutine that sets the values in the program.)
Here is the run.C main file:
#include "../include/DGCPM.h"
#define particle_num 5
class DGCPM **mallocModels(int n);
int main(int argc, char *argv[]){
class DGCPM **m;
class DGCPM **cur;
m=mallocModels(particle_num);//update
for(int t = 0; t < 48; t++){
//Update m, and then...
cur = m;
m = (DGCPM**)malloc(sizeof(class DGCPM *)*particle_num);
for(int i=0;i<particle_num;i++){
randomidx = ((double)rand() / ((double)RAND_MAX + 1));
currentidx = find(cumPw,randomidx,particle_num);
m[i] = new class DGCPM(cur[currentidx]);
}
for(int i=0;i<particle_num;i++){
delete cur[i];
}
free(cur);
}
return 0;
}
/*============================================================================
mallocModels - allocate the ensemble of models
============================================================================*/
class DGCPM **mallocModels(int n){
class DGCPM **m;
m=(class DGCPM **)amjSafeMalloc(sizeof(class DGCPM *)*n,
(char *)"mallocModels:m");
for(int i=0;i<n;i++)
m[i]=new class DGCPM();
return m;
}
/*============================================================================
Find - Return a particle index that has a high probability of having a high weight.
============================================================================*/
int find(float *cumPw, double randomidx, int nM){
/*Wrong implementation*/
int index = 0;
flag = 0;
while(flag == 0){
if(cumPw[i] >= randomidx){
flag = 1;
i++;
}
else{
index ++;
}
}
return index; //Sometimes, index was going to number of models, or number of models + 1, which are out of bounds.
/*Correct implementation*/
int index = 0;
for(int i = 0; i < nM-1; i++){
if(cumPw[i] >= randomidx){
index = i;
break;
}
}
if(index >= nM){
index = nM-1;
printf("Error: random index exceeds bounds");
}
return index;
}
Here is the DGCPM.h header file:
class DGCPM{
public:
DGCPM(); /* Initialized with defaults setup */
DGCPM(class DGCPM *cur); //Copy constructor
DGCPM(int nThetaCells, int nPhiCells, float thetaMin, float thetaMax);
~DGCPM(); /* Free memory */
private:
int internal; /* 1=memory allocated internally and should be deallocated when ~DGCPM is called, 2=memory is internal except for mGridN which is external */
int nThetaCells,nRCells,nPhiCells;
float thetaMin,thetaMax;
float rMin,rMax;
float delR,delPhi;
float deltMax;
float *vRCells; /* [nThetaCells] */
float *vThetaCells; /* [nThetaCells] */
float *vPhiCells; /* [nPhiCells] */
float **mGridB; /* [nPhiCells][nThetaCells] */
float **mGridBi; /* [nPhiCells][nThetaCells] */
float **mGridPot; /* [nPhiCells][nThetaCells] */
float **mGridEr; /* [nPhiCells][nThetaCells] */
float **mGridEp; /* [nPhiCells][nThetaCells] */
float **mGridVr; /* [nPhiCells][nThetaCells] */
float **mGridVp; /* [nPhiCells][nThetaCells] */
float **mGridN; /* [nPhiCells][nThetaCells] */
float **mGridHalf; /* [nPhiCells][nThetaCells] Particles / weber (workspace for upwind and superbee) */
float **mGridDen; /* [nPhiCells][nThetaCells] */
float **mGridVol; /* [nPhiCells][nThetaCells] */
float **mGridX; /* [nPhiCells][nThetaCells] */
float **mGridY; /* [nPhiCells][nThetaCells] */
float **mGridOc; /* [nPhiCells][nThetaCells] */
float **std; /* [nPhiCells][nThetaCells] */
float parI[2];
float delTMax;
float Re;
void initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
};
And finally the DGCPM.C object wrapper:
/******************************************************************************
* DGCPM.C - This implements the DGCPM plasmasphere model class *
******************************************************************************/
#define TWO_PI 6.2831853071795864769252866
#include "../include/DGCPM.h"
# include <cstdlib>
# include <cmath>
/*============================================================================
DGCPM::DGCPM()
Initialize with default setup
============================================================================*/
DGCPM::DGCPM(){
internal=1;
initialize(200,200,14.963217,60.0);/*(180,200,14.963217,60.0);*/
}
//Copy Constructor
DGCPM::DGCPM(class DGCPM *cur){
internal=1;
initialize(200,200,14.963217,60.0);/*(180,200,14.963217,60.0);*/
memcpy(vRCells, cur->vRCells,sizeof(float)*nThetaCells);
memcpy(vPhiCells, cur->vPhiCells,sizeof(float)*nPhiCells);
memcpy(vThetaCells, cur->vThetaCells,sizeof(float)*nThetaCells);
memcpy(mGridB[0], cur->mGridB[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridBi[0], cur->mGridBi[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridPot[0], cur->mGridPot[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridEr[0], cur->mGridEr[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridEp[0], cur->mGridEp[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVr[0], cur->mGridVr[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVp[0], cur->mGridVp[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridN[0], cur->mGridN[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridHalf[0], cur->mGridHalf[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridDen[0], cur->mGridDen[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVol[0], cur->mGridVol[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridOc[0], cur->mGridOc[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridX[0], cur->mGridX[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridY[0], cur->mGridY[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(std[0], cur->std[0],sizeof(float)*nThetaCells*nPhiCells);
}
/*============================================================================
DGCPM::~DGCPM()
Free allocated memory
============================================================================*/
DGCPM::~DGCPM(){
if(internal>=1){
amjFree1dFloat(vRCells);
amjFree1dFloat(vThetaCells);
amjFree1dFloat(vPhiCells);
amjFree2dFloat(mGridB);
amjFree2dFloat(mGridBi);
amjFree2dFloat(mGridEr);
amjFree2dFloat(mGridEp);
amjFree2dFloat(mGridVr);
amjFree2dFloat(mGridVp);
if(internal==1) amjFree2dFloat(mGridN);
amjFree2dFloat(mGridHalf);
amjFree2dFloat(mGridDen);
amjFree2dFloat(mGridVol);
amjFree2dFloat(mGridX);
amjFree2dFloat(mGridY);
amjFree2dFloat(mGridOc);
amjFree2dFloat(std);
}
}
/******************************************************************************
******************************************************************************
** Private functions **
******************************************************************************
******************************************************************************/
/*============================================================================
DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
This is the initialization function used when all memory should be
allocated internally.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax){
initialize(nThetaCells,nPhiCells,thetaMin,thetaMax,
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vRCells"),
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vThetaCells"),
amjMalloc1dFloat(nPhiCells,(char *)"DGCPM::DGCPM:vPhiCells"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridB"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridBi"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridPot"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridN"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridHalf"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridDen"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVol"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridX"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridY"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridOc"),
//Added by J.Wise
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:std"));
}
/*============================================================================
DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
This is the initialization function used when mGridN is passed from
the outside but all other memory is allocated internally.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax, float **mGridN){
initialize(nThetaCells,nPhiCells,thetaMin,thetaMax,
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vRCells"),
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vThetaCells"),
amjMalloc1dFloat(nPhiCells,(char *)"DGCPM::DGCPM:vPhiCells"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridB"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridBi"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridPot"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVp"),
mGridN,
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridHalf"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridDen"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVol"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridX"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridY"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridOc"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:std"));
}
/*
initialize() - this initialization function uses pre-allocated
memory areas passed in from the outside. This function is used both
when DGCPM allocates memory itself and when it receives
pre-allocated memory from the outside in order to eliminate
duplication of code with the associated risk of errors.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax, float *vRCells, float *vThetaCells,
float *vPhiCells, float **mGridB, float **mGridBi,
float **mGridPot, float **mGridEr, float **mGridEp,
float **mGridVr, float **mGridVp, float **mGridN,
float **mGridHalf, float **mGridDen, float **mGridVol,
float **mGridX, float **mGridY, float **mGridOc, float **std){
DGCPM::nThetaCells=nThetaCells;
DGCPM::nPhiCells=nPhiCells;
DGCPM::thetaMin=thetaMin;
DGCPM::thetaMax=thetaMax;
DGCPM::vRCells=vRCells;
DGCPM::vThetaCells=vThetaCells;
DGCPM::vPhiCells=vPhiCells;
DGCPM::mGridB=mGridB;
DGCPM::mGridBi=mGridBi;
DGCPM::mGridPot=mGridPot;
DGCPM::mGridEr=mGridEr;
DGCPM::mGridEp=mGridEp;
DGCPM::mGridVr=mGridVr;
DGCPM::mGridVp=mGridVp;
DGCPM::mGridN=mGridN;
DGCPM::mGridHalf=mGridHalf;
DGCPM::mGridDen=mGridDen;
DGCPM::mGridVol=mGridVol;
DGCPM::mGridX=mGridX;
DGCPM::mGridY=mGridY;
DGCPM::mGridOc=mGridOc;
DGCPM::std=std;
Re=6.378e6;
initialize_(&nThetaCells,&nRCells,&nPhiCells,&thetaMin,&thetaMax,&rMin,&rMax,
&delR,&delPhi,vRCells,vThetaCells,vPhiCells,mGridB[0],mGridBi[0],
mGridN[0],mGridDen[0],mGridVol[0],mGridX[0],mGridY[0],mGridOc[0],std[0]);
}
Here's a sample custom memory function, which takes care of initialization and allocation:
void *amjSafeMalloc(int n, char *message){
void *d;
d=malloc(n);
if(d==NULL){
fprintf(stderr,"amjSafeMalloc error: Could not allocate %d bytes "
"for %s. Exiting.\n",n,message);
exit(1);
}
return d;
}
float *amjMalloc1dFloat(int a, char *message){
float *d;
sprintf(msg,"%s:amjMalloc1DFloat:d",message);
d=(float *)amjSafeMalloc(sizeof(float)*a,msg);
return d;
}
float **amjMalloc2dFloat(int a, int b, char *message){
float **d;
int i;
sprintf(msg,"%s:amjMalloc2DFloat:d",message);
d=(float **)amjSafeMalloc(sizeof(float *)*a,msg);
sprintf(msg,"%s:amjMalloc2DFloat:d[0]",message);
d[0]=(float *)amjSafeMalloc(sizeof(float)*a*b,msg);
for(i=1;i<a;i++) d[i]=d[i-1]+b;
return d;
}

class DGCPM
{
public:
DGCPM(int nThetaCells, int nPhiCells)
: nThetaCells(nThetaCells)
, nPhiCells(nPhiCells)
, mGridB(nThetaCells, vector<float>(nPhiCells)) // first Y then X
{
}
private:
int nThetaCells, nPhiCells;
vector<vector<float>> mGridB;
};
Deep copies for free. Deletes memory for free.
By free I mean you don't have to write the code..

From your comment /* [nPhiCells][nThetaCells] */ in your class definition, I take it that you intent the float** to be 2D arrays. However, if you can use them like 2D arrays, they are actually arrays of pointers to arrays. That is a huge difference: it means, you have to copy nPhiCells individual arrays of nThetaCells elements and you have to setup the pointer array itself. Now, when you do
memcpy(mGridHalf[0], cur->mGridHalf[0],sizeof(float)*nThetaCells*nPhiCells);
in your copy constructor, you assume that there is no pointer array, and that all line arrays are sequential in memory. Either this copy exceeds the bounds of the pointer array (segfaulting), or accessing you array via mGridHalf[i][j] simply does the wrong thing, reinterpreting float data as pointers (and segfaulting).
Unfortunately, C++ is a horrible language for interacting with fortran multidimensional arrays because it has no notion of variable sized arrays. So the following is C code, not C++ code. In C, you can tackle the issue like this:
float (*mGridHalf)[nThetaCells] = malloc(nPhiCells*sizeof(*mGridHalf));
will correctly allocate and type a 2D array (i. e. an array of arrays) that can be accessed with
mGridHalf[phi][theta] = 7.3;
Since all elements are consecutive in memory, the entire thing can correctly be copied with
memcpy(mGridHalf, cur->mGridHalf, nPhiCells*sizeof(*mGridHalf));
and freed with
free(mGridHalf);
Technically, mGridHalf is now a pointer to an array, the pointer arithmetic that is invoked by the array access effectively does the same computation as if you had written:
float* foo = malloc(nPhiCells*nThetaCells*sizeof(*foo));
foo[phi*nThetaCells + theta] = 7.3;
However, using the correct pointer type float (*)[nThetaCells] allows you to avoid doing the index computation yourself.

The issue is more than likely you're assuming that float** has data that is one contiguous chunk of memory. If so, here is one way of accomplishing this. First, I show the wrong way (but used often):
float** createFloat2D(int nRows, int nCols)
{
float** p1 = new float*[nRows];
for (int i = 0; i < nCols; ++i )
p1[i] = new float[nCols];
return p1;
}
void destroyFloat2D(float**f, int nRows, int nCols)
{
for (int i = 0; i < nCols; ++i )
delete [] f[i];
delete [] f;
}
Looks simple, and works for most purposes, but will fail if the assumption is made that the data is in a contiguous chunk of memory.
The other way to create a 2D array is to make the data contiguous.
float** createFloat2D(int nRows, int nCols)
{
float** p1 = new float*[nRows]; // allocate row pointers
float* p2 = new float[nRows * nCols]; // allocate data in one chunk
for (int i = 0; i < nCols; ++i, p2 += nCols )
p1[i] = p2; // point the row pointers into the pool of memory
return p1;
}
void destroyFloat2D(float**f)
{
delete [] f[0];
delete [] f;
}
Note above that the data is created in one contiguous "pool". Now, using yourArray[0] actually points to the beginning of this memory. Also note that destruction is done without having to know the number of rows or columns, since f[0] points to the pool of memory.
So now, code like this should work
float** mGridB = createFloat2D(nThetaCells, nPhiCells);
//...
memcpy(mGridB[0], cur->mGridB[0], sizeof(float)*nThetaCells*nPhiCells);
The code above now works correctly, if we use the second method of creating the 2d array.
I would still stick with the vector for 1-d float arrays, as you have the pointer to the data (see my earlier comment). For the code above, I would wrap it in a class that handles creation and destruction easily.
The last thing is the copy constructor. A copy constructor in C++ has the following possible signatures:
DGCPM(const DGCPM&);
DGCPM(DGCPM&);
DGCPM(volatile DBCPM&);
I may have missed one, but the signature should be one of those above, more than likely, the first one (you can also have additional arguments after the reference argument, but they all must have default values).
Note that a DBCPM* is not a valid argument for a copy constructor as your code stated -- remember that a copy constructor is not only for use, but also the compiler will use it to make copies. So to signal the compiler that "yes, this function is used to make copies", your function must match one of the signatures above.
In addition, you need an assignment operator, in other words, the class needs to implement the "rule of 3".

This going to sound so stupid (elementary programming error): my index "i" was going beyond (number of models - 1), so I was getting a segmentation fault from accessing memory that didn't exist.

Related

Variable gets lost after allocating array of structs in cuda

I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc and cudaMemcpys of the variables in the struct from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables withing Graph, I can see that they are properly copied (by accessing it in a kernel as in the example below). For example, I can check that graph.nBoundary=3.
However, I can only see this if I do not allocate and copy the memory of Node *. If I do, I get -858993460 instead of 3. Interestingly, Node * is not wrongly allocated, as I can inspect the value of say graph.node[0].pos[0] and it has the correct value.
This only happens with the graph.nBoundary. All the other variables remain with the correct numerical values, but this one gets "wronged" when running the cudaMemcpy of the Node*.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Crate auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}
The problem is in the function cudaGraphMalloc where you are trying to allocate device memory to the members of outGraph which has already been allocated on the device. In process of doing so, you are de-referencing a device pointer on host which is illegal.
To allocate device memory to members of struct type variable which exists on the device, we first have to create a temporary host variable of that struct type, then allocate device memory to its members, and then copy it to the struct which exists on the device.
I have answered a similar question here. Please take a look at it.
The fixed code may look like this:
#include <algorithm>
#include <cuda_runtime.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph {
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph, unsigned int * d_res) {
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++) {
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++) {
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%u\n", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph)
{
//Create auxiliary Graph variable on host
Graph temp;
//copy constants
temp.nNode = inGraph->nNode;
temp.nBoundary = inGraph->nBoundary;
// copy boundary
gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));
//Create nodes
size_t nodeBytesTotal = temp.nNode * sizeof(Node);
gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));
for (int i = 0; i < temp.nNode; i++)
{
//Create auxiliary node on host
Node auxNodeHost;
//Allocate device memory to position member of auxillary node
size_t nodeBytes = 3 * sizeof(float);
gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));
//Copy auxillary host node to device
Node* dPtr = temp.node + i;
gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
}
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));
return outGraph;
}
Be advised that you will have to keep the host copies of internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case the variable Node auxNodeHost (created in each iteration) and Graph temp are those variables.
The above code does not do that and is just for demonstration purpose.
Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

An error cannot convert from 'void *' to 'float *' `

I write a c++ function and its associated mex. But the one kind of input of c++ function is double *.
The output of function pointwise_search is a pointer. I was told that I should delete it. But I do not know where I should delete it since I need it as an output.
From the answer, I know I should check the type of input by mxIsSingle. So I correct the function mexFunction. But there is an error error C2440: '=' : cannot convert from 'void *' to 'float *'.
In matlab, I should call like pointwise_search(float *p,float q, num_thres,float n, len ). If I have a vector v_in_matlab=rand(5,1) in matlab. I should I get it pointer by p=single(v_in_matlab); and then pointwise_search(p...
Thanks in advance.
#include "mex.h"
#include <iostream>
#include <algorithm>
#include <functional>
#include <vector>
using namespace std;
float * pointwise_search(float *p,float *q,int num_thres, float* n, int len )
{
vector<float> P(p, p + num_thres);
vector<float> Q(q, q + num_thres);
int size_of_threshold = P.size();
float *Y=new float[len];
float *z=new float[len];
typedef vector<float > ::iterator IntVectorIt ;
IntVectorIt start, end, it, location ;
start = P.begin() ; // location of first
// element of Numbers
end = P.end() ; // one past the location
// last element of Numbers
for (int i=0;i<len;i++)
{
location=lower_bound(start, end, n[i]) ;
z[i]=location - start;
if(z[i]>0&&z[i]<size_of_threshold)
{
Y[i]=(n[i]-P[z[i]])/(P[z[i]-1]-P[z[i]])*(Q[z[i]-1]-Q[z[i]])+Q[z[i]];
}
else
{
Y[i]=Q[z[i]];
}
}
return (&Y[0]);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
float * Numbers, *Q;
if (nrhs != 5)
{
mexErrMsgTxt("Input is wrong!");
}
float *n = (float*) mxGetData(prhs[3]);
int len = (int) mxGetScalar(prhs[4]);
int num_thres = (int) mxGetScalar(prhs[2]);
/* Input gs */
if(mxIsComplex(prhs[0])
||!mxIsSingle(prhs[0]))
mexErrMsgTxt("Input 0 should be a class Single");
/* get the pointer to gs */
Numbers=mxGetData(prhs[0]);
if(mxIsComplex(prhs[0])
||!mxIsSingle(prhs[0]))
mexErrMsgTxt("Input 0 should be a class Single");
/* get the pointer to gs */
Q=mxGetData(prhs[1]);
// float * Numbers= (float *)mxGetData(prhs[0]);
// float * Q= (float *)mxGetData(prhs[1]);
float * out= pointwise_search(Numbers,Q,num_thres,n,len );
//float* resizedDims = (float*)mxGetPr(out);
}
In Matlab use single() to convert the data before calling the mexFunction. On the C++ side verify that the type is indeed single by mxIsSingle(). After this you can happily cast to float*.
Before you worry about your MEX code, have another look at your C++ function first. You have some really obvious memory leaks (new but no delete[]).
Regarding MEX you should never see this:
(float *)mxGetPr(prhs[0])
You can't cast a double* to a float* and expect the numbers to make any sense. Input single from MATLAB and use:
(float *)mxGetData(prhs[0])
And do as Trilarion suggests and test all of your input mxArrays for the expected data type.

Why does reverse this function not work

In the constructor I fill the array on the device side.
but now I want to execute reverse function on the array.
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
__global__ void generateVector(int *data,int count){
int tid = blockIdx.x;
data[tid] = -tid;
}
__global__ void reverseArray(int *data,int count){
int tid = blockIdx.x;
data[tid] = tid;
}
class FData{
private:
int *data;
int size;
public:
FData(int sizeP){
size = sizeP;
data = new int[size];
int *devA;
cudaMalloc((void**) &devA, size * sizeof(int));
generateVector<<<size,1>>>(devA,size);
cudaMemcpy(data,devA, size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
~FData(){
delete [] data;
}
int getSize(){
return size;
}
int elementAt(int i){
return data[i];
}
void reverse(){
int *devA;
cudaMalloc((void**) &devA, sizeof(int));
reverseArray<<<size,1>>>(devA,size);
cudaMemcpy(data,devA,size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
};
int main(void) {
FData arr(30);
cout << arr.elementAt(1);
arr.reverse();
cout << arr.elementAt(1);
return 0;
}
It still prints the values which I filled in the constructor. What is the problem here? How can i solve it? What is going wrong?
Your kernels aren't reversing anything. They're just negating the values, so if anything I would be quite surprised if you saw anything get reversed. With that said, if you add error checking to your code (see this other SO post on how best to do the error checking) then you'll see that your code will fail on the call to cudaMalloc in your reverse function. You can fix this by changing devA to be a plain pointer (it doesn't really make sense for you to be allocating it as a host-array anyways, as you're not using it on the host to begin with).
void reverse(){
int *devA;
cudaMalloc((void**) &devA, size * sizeof(int));
reverseArray<<<size,1>>>(devA,size);
cudaMemcpy(data,devA,size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(devA);
}
Also, you should free your memory too, you have both host-side and device-side memory leaks. Whenever you have a cudaMalloc call, you should havea corresponding cudaFree. Also, consider adding a destructor to free your host-side data member, as you have a memory leak there too.
~FData()
{
delete [] data;
}

Constructors and array of object in C++

I'm trying to create an application in C++. In the application I have the default constructor and another constructor with 3 arguments.
The user is providing from the keyboard an integer that it will be used to create an array of objects using the non default constructor.
Unfortunately I haven't been able to finish it till now, since I'm having issues with the creation of the array of objects that they will use the non default constructor.
Any suggestions or help?
#include<iostream>
#include<cstring>
#include<cstdlib>
#include <sstream>
using namespace std;
class Station{
public:
Station();
Station(int c, char *ad, float a[]);
~Station();
void setAddress(char * addr){
char* a;
a = (char *)(malloc(sizeof(addr+1)));
strcpy(a,addr);
this->address = a;
}
void setCode(int c){
code=c;
}
char getAddress(){
return *address;
}
int getCode(){
return code;
}
float getTotalAmount(){
float totalAmount=0;
for(int i=0;i<4;i++){
totalAmount+=amount[i];
}
return totalAmount;
}
void print(){
cout<<"Code:"<<code<<endl;
cout<<"Address:"<<address<<endl;
cout<<"Total Amount:"<<getTotalAmount()<<endl;
cout<<endl;
}
private:
int code;
char *address;
float amount[4];
};
Station::Station(){
code= 1;
setAddress("NO ADDRESS GIVEN");
amount[0]= 0.0;
amount[1]= 0.0;
amount[2]= 0.0;
amount[3]= 0.0;
}
Station::Station(int c, char *ad, float a[]){
if( (c>=1&& c<=10 ) ){
code=c;
address=ad;
for(int i=0;i<4;i++){
amount[i]=a[i];
}
}else{
code= 1;
setAddress("NO ADDRESS GIVEN");
amount[0]= 0.0;
amount[1]= 0.0;
amount[2]= 0.0;
amount[3]= 0.0;
}
}
Station::~Station(){
}
int main(){
int size,code;
char *addrr;
addrr = (char *)(malloc(sizeof(addrr+1)));
float mes[4];
do{
cout<<"size of array:";
cin>>size;
}while(size<=0 || size>=11);
// Station *stations= new Station[size];
// Station** stations = new Station*[size];
Station stations[size];
for(int i=0;i<size;i++){
cout<<"code:";
cin>>code;
cout<<"address:";
cin>>addrr;
double amo=0;
for(int k=0;k<4;k++){
cout<<"values"<<k+1<<":";
cin>>mes[k];
}
}
/*
for(int q=0;q<size;q++){
stations[q].print();
}
*/
return 0;
}
the values that I'll take from cin I want to assign them to the objects of the array!
You can either create the array default-initialized and then fill the array with the wanted object:
foo arr[10];
std::fill(arr, arr+10, foo(some, params));
Alternatively you could use std::vector and do just:
std::vector<foo> arr(10, foo(some, params));
In C++0x, you can use braced-init-list in new expression, which means you can do this:
#include <iostream>
class A
{
public:
A(int i, int j){std::cout<<i<<" "<<j<<'\n';}
};
int main(int argc, char ** argv)
{
int *n = new int[3]{1,2,3};
A *a = new A[3]{{1,2},{3,4},{5,6}};
delete[] a;
delete[] n;
return 0;
}
Compiled under g++ 4.5.2, using g++ -Wall -std=c++0x -pedantic
Since you say you can't use std::string, this is going to be much more difficult. The line addrr = (char *)(malloc(sizeof(addrr+1))); is not doing what you think it is. Instead of using malloc to allocate on the heap and since there is no free (which will lead to a memory leak), it will be much easier if we allocate on the stack with a predetermined buffer size: char addrr[BUFFER_LENGTH]. With BUFFER_LENGTH defined before Station's declaration as const int BUFFER_LENGTH = 20; or some other appropriate length.
To use the non-default constructor, adding stations[i] = Station(c, addrr, mes); at the end of the for loop will do the trick.
for(int i=0;i<size;i++){
cout<<"code:";
cin>>code;
cout<<"address:";
cin>>addrr; // do not read in strings longer than 20 characters or increase BUFFER_LENGTH’s size
double amo=0;
for(int k=0;k<4;k++){
cout<<"values"<<k+1<<":";
cin>>mes[k];
}
stations[i] = Station(c, addrr, mes);
}
But, this is not going to work properly since the constructor is copying the addrr pointer, not the data. I would recommend also changing the data member char *address to char address[BUFFER_LENGTH]. Then, in the constructor you can replace the line address=ad; with strcpy(address, ad);.
Note: setAddress and getAddress will now need to be updated.
Another line that is troubling is Station stations[size];. This is non-standard since size is not a known at compile time. Either use Station *stations= new Station[size]; and remember to delete or if you can use a std::vector, use std::vector<Station> stations(size);
If you do go the std::vector route, using push_back will work nicely:
std::vector<Station> stations;
for(int i=0;i<size;i++){
cout<<"code:";
cin>>code;
cout<<"address:";
cin>>addrr;
double amo=0;
for(int k=0;k<4;k++){
cout<<"values"<<k+1<<":";
cin>>mes[k];
}
stations.push_back( Station(c, addrr, mes) );
}

How to byteswap a double?

I'm trying to write a byteswap routine for a C++ program running on Win XP. I'm compiling with Visual Studio 2008. This is what I've come up with:
int byteswap(int v) // This is good
{
return _byteswap_ulong(v);
}
double byteswap(double v) // This doesn't work for some values
{
union { // This trick is first used in Quake2 source I believe :D
__int64 i;
double d;
} conv;
conv.d = v;
conv.i = _byteswap_uint64(conv.i);
return conv.d;
}
And a function to test:
void testit() {
double a, b, c;
CString str;
for (a = -100; a < 100; a += 0.01) {
b = byteswap(a);
c = byteswap(b);
if (a != c) {
str.Format("%15.15f %15.15f %15.15f", a, c, a - c);
}
}
}
Getting these numbers not matching:
-76.789999999988126 -76.790000000017230 0.000000000029104
-30.499999999987718 -30.499999999994994 0.000000000007276
41.790000000014508 41.790000000029060 -0.000000000014552
90.330000000023560 90.330000000052664 -0.000000000029104
This is after having read through:
How do I convert between big-endian and little-endian values in C++?
Little Endian - Big Endian Problem
You can't use << and >> on double, by the way (unless I'm mistaken?)
Although a double in main memory is 64 bits, on x86 CPUs double-precision registers are 80 bits wide. So if one of your values is stored in a register throughout, but the other makes a round-trip through main memory and is truncated to 64 bits, this could explain the small differences you're seeing.
Maybe you can force variables to live in main memory by taking their address (and printing it, to prevent the compiler from optimizing it out), but I'm not certain that this is guaranteed to work.
b = byteswap(a);
That's a problem. After swapping the bytes, the value is no longer a proper double. Storing it back to a double is going to cause subtle problems when the FPU normalizes the value. You have to store it back into an __int64 (long long). Modify the return type of the method.
Try 3
Okay, found out there's a better way. The other way you have to worry about the order you pack/unpack stuff. This way you don't:
// int and float
static void swap4(void *v)
{
char in[4], out[4];
memcpy(in, v, 4);
out[0] = in[3];
out[1] = in[2];
out[2] = in[1];
out[3] = in[0];
memcpy(v, out, 4);
}
// double
static void swap8(void *v)
{
char in[8], out[8];
memcpy(in, v, 8);
out[0] = in[7];
out[1] = in[6];
out[2] = in[5];
out[3] = in[4];
out[4] = in[3];
out[5] = in[2];
out[6] = in[1];
out[7] = in[0];
memcpy(v, out, 8);
}
typedef struct
{
int theint;
float thefloat;
double thedouble;
} mystruct;
static void swap_mystruct(void *buf)
{
mystruct *ps = (mystruct *) buf;
swap4(&ps->theint);
swap4(&ps->thefloat);
swap8(&ps->thedouble);
}
Send:
char buf[sizeof (mystruct)];
memcpy(buf, &s, sizeof (mystruct));
swap_mystruct(buf);
Recv:
mystruct s;
swap_mystruct(buf);
memcpy(&s, buf, sizeof (mystruct));
Try 2
Okay, got it working! Hans Passant was right. They got me thinking with the "no longer a proper double" comment. So you can't byteswap a float into another float because then it might be in an improper format, so you have to byteswap to a char array and unswap back. This is the code I used:
int pack(int value, char *buf)
{
union temp {
int value;
char c[4];
} in, out;
in.value = value;
out.c[0] = in.c[3];
out.c[1] = in.c[2];
out.c[2] = in.c[1];
out.c[3] = in.c[0];
memcpy(buf, out.c, 4);
return 4;
}
int pack(float value, char *buf)
{
union temp {
float value;
char c[4];
} in, out;
in.value = value;
out.c[0] = in.c[3];
out.c[1] = in.c[2];
out.c[2] = in.c[1];
out.c[3] = in.c[0];
memcpy(buf, out.c, 4);
return 4;
}
int pack(double value, char *buf)
{
union temp {
double value;
char c[8];
} in, out;
in.value = value;
out.c[0] = in.c[7];
out.c[1] = in.c[6];
out.c[2] = in.c[5];
out.c[3] = in.c[4];
out.c[4] = in.c[3];
out.c[5] = in.c[2];
out.c[6] = in.c[1];
out.c[7] = in.c[0];
memcpy(buf, out.c, 8);
return 8;
}
int unpack(char *buf, int *value)
{
union temp {
int value;
char c[4];
} in, out;
memcpy(in.c, buf, 4);
out.c[0] = in.c[3];
out.c[1] = in.c[2];
out.c[2] = in.c[1];
out.c[3] = in.c[0];
memcpy(value, &out.value, 4);
return 4;
}
int unpack(char *buf, float *value)
{
union temp {
float value;
char c[4];
} in, out;
memcpy(in.c, buf, 4);
out.c[0] = in.c[3];
out.c[1] = in.c[2];
out.c[2] = in.c[1];
out.c[3] = in.c[0];
memcpy(value, &out.value, 4);
return 4;
}
int unpack(char *buf, double *value)
{
union temp {
double value;
char c[8];
} in, out;
memcpy(in.c, buf, 8);
out.c[0] = in.c[7];
out.c[1] = in.c[6];
out.c[2] = in.c[5];
out.c[3] = in.c[4];
out.c[4] = in.c[3];
out.c[5] = in.c[2];
out.c[6] = in.c[1];
out.c[7] = in.c[0];
memcpy(value, &out.value, 8);
return 8;
}
And a simple test function:
typedef struct
{
int theint;
float thefloat;
double thedouble;
} mystruct;
void PackStruct()
{
char buf[sizeof (mystruct)];
char *p;
p = buf;
mystruct foo, foo2;
foo.theint = 1;
foo.thefloat = 3.14f;
foo.thedouble = 400.5;
p += pack(foo.theint, p);
p += pack(foo.thefloat, p);
p += pack(foo.thedouble, p);
// Send or recv char array
p = buf;
p += unpack(p, &foo2.theint);
p += unpack(p, &foo2.thefloat);
p += unpack(p, &foo2.thedouble);
}
How to swap the bytes in any basic data type or array of bytes
ie: How to swap the bytes in place in any array, variable, or any other memory block, such as an int16_t, uint16_t, uint32_t, float, double, etc.:
Here's a way to improve the efficiency from 3 entire copy operations of the array to 1.5 entire copy operations of the array. See also the comments I left under your answer. I said:
Get rid of this: memcpy(in, v, 4); and just copy-swap straight into out from v, then memcpy the swapped values back from out into v. This saves you an entire unnecessary copy, reducing your copies of the entire array from 3 to 2.
There's also a further optimization to reduce the copies of the entire array from 2 to 1.5: copy the left half of the array into temporary variables, and the right-half of the array straight into the left-half, swapping as appropriately. Then copy from the temporary variables, which contain the old left-half of the array, into the right-half of the array, swapping as appropriately. This results in the equivalent of only 1.5 copy operations of the entire array, to be more efficient. Do all this in-place in the original array, aside from the temp variables you require for half of the array.
1. Here is my general C and C++ solution:
/// \brief Swap all the bytes in an array to convert from little-endian
/// byte order to big-endian byte order, or vice versa.
/// \note Works for arrays of any size. Swaps the bytes **in place**
/// in the array.
/// \param[in,out] byte_array The array in which to swap the bytes in-place.
/// \param[in] len The length (in bytes) of the array.
/// \return None
void swap_bytes_in_array(uint8_t * byte_array, size_t len)
{
size_t i_left = 0; // index for left side of the array
size_t i_right = len - 1; // index for right side of the array
while (i_left < i_right)
{
// swap left and right bytes
uint8_t left_copy = byte_array[i_left];
byte_array[i_left] = byte_array[i_right];
byte_array[i_right] = left_copy;
i_left++;
i_right--;
}
}
Usage:
// array of bytes
uint8_t bytes_array[16];
// Swap the bytes in this array of bytes in place
swap_bytes_in_array(bytes_array, sizeof(bytes_array));
double d;
// Swap the bytes in the double in place
swap_bytes_in_array((uint8_t*)(&d), sizeof(d));
uint64_t u64;
// swap the bytes in a uint64_t in place
swap_bytes_in_array((uint8_t*)(&u64), sizeof(u64));
2. And here is an optional C++ template wrapper around that to make it even easier to use in C++:
template <typename T>
void swap_bytes(T *var)
{
// Note that `sizeof(*var)` is the exact same thing as `sizeof(T)`
swap_bytes_in_array((uint8_t*)var, sizeof(*var));
}
Usage:
double d;
// Swap the bytes in the double in place
swap_bytes(&d);
uint64_t u64;
// swap the bytes in a uint64_t in place
swap_bytes(&u64);
Notes & unanswered questions
Note, however, that #Hans Passant seems to be onto something here. Although the above works perfectly on any signed or unsigned integer type, and seems to work on float and double for me too, it seems to be broken on long double. I think it's because when I store the swapped long double back into a long double variable, if it is determined to be not-a-valid long double representation anymore, something automatically changes a few of the swapped bytes or something. I'm not entirely sure.
On many 64-bit systems, long double is 16 bytes, so perhaps the solution is to keep the swapped version of the long double inside a 16-byte array and NOT attempt to use it or cast it back to a long double from the uint8_t 16-byte array until either A) it has been sent to the receiver (where the endianness of the system is opposite, so it's in good shape now) and/or B) byte-swapped back again so it's a valid long double again.
Keep the above in mind in case you see problems with float or double types too, as I see with only long double types.
Linux byteswap and endianness and host-to-network byte order utilities
Linux also has a bunch of built-in utilities via gcc GNU extensions that you can use. See:
https://man7.org/linux/man-pages/man3/bswap.3.html - #include <byteswap.h>
https://man7.org/linux/man-pages/man3/endian.3.html - #include <endian.h>
https://man7.org/linux/man-pages/man3/byteorder.3.html - #include <arpa/inet.h> - generally used for network sockets (Ethernet packets) and things; inet stands for "internet"