C++ Pointer to byte array optimization - c++

I am currently using this approach to copy some byte values over:
for (int i = 0; i < (iLen + 1); i++)
{
*(pBuffer + i) = Image.pVid[i];
}
I would like to ask if there is a way to copy these values over in one go, perhaps by using memcopy to gain more speed.
The entire code is:
extern "C" __declspec(dllexport) int __stdcall GetCameraImage(BYTE pBuffer[], int Type, int uWidth, int uHeight)
{
CameraImage Image;
int ret;
Image.pVid = (unsigned int*)malloc(4 * uWidth*uHeight);
ret = stGetCameraImage(&Image, 1, uWidth, uHeight);
if (ret == ERR_SUCCESS)
{
int iLen = (4 * uWidth * uHeight);
for (int i = 0; i < (iLen + 1); i++)
{
*(pBuffer + i) = Image.pVid[i];
}
////print(“ImageType = %d, width = %d, height = %d”, Image.Type, Image.Width,
//// Image.Height);
////print(“First Pixel : B = %d, G = %d, R = %d”, Image.pVid[0], Image.pVid[1],
//// Image.pVid[2]);
////print(“Second Pixel : B = %d, G = %d, R = %d”, Image.pVid[4], Image.pVid[5],
//// Image.pVid[6]);
}
free(Image.pVid);
return ret;
}
Edit:
*pVid is this:
unsigned int *pVid; // pointer to image data (Format RGB32...)

The way your code is currently written, each assignment in your loop will overflow and give you some garbage value in pBuffer because you're trying to assign an unsigned int to a BYTE. On top of that, you will run off the end of the Image.pVid array because i is counting bytes, not unsigned ints
You could fix your code by doing this:
*(pBuffer + i) = ((BYTE*)Image.pVid)[i];
But that is pretty inefficient. Better to move whole words at a time, or you could just use memcpy instead:
memcpy(pBuffer,Image.pVid,iLen) //pBuffer must be at least iLen bytes long

Related

What's the correct way to assign one GPU memory buffer value from another GPU memory buffer with some arithmetic on each source buffer's element?

I'm a newbie for GPU programming using Cuda toolkit, and I have to write some code offering the functionality as I mentioned in the title.
I'd like to paste the code to show what exactly I want to do.
void CTrtModelWrapper::forward(void **bindings,
unsigned height,
unsigned width,
short channel,
ColorSpaceFmt colorFmt,
PixelDataType pixelType) {
uint16_t *devInRawBuffer_ptr = (uint16_t *) bindings[0];
uint16_t *devOutRawBuffer_ptr = (uint16_t *) bindings[1];
const unsigned short bit = 16;
float *devInputBuffer_ptr = nullptr;
float *devOutputBuffer_ptr = nullptr;
unsigned volume = height * width * channel;
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
common::cudaCheck(cudaMalloc((void **) &devOutputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
unsigned short npos = 0;
switch (pixelType) {
case PixelDataType::PDT_INT8: // high 8bit
npos = bit - 8;
break;
case PixelDataType::PDT_INT10: // high 10bit
npos = bit - 10;
break;
default:
break;
}
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
}
break;
default:
break;
}
void *rtBindings[2] = {devInputBuffer_ptr, devOutputBuffer_ptr};
// forward
this->_forward(rtBindings);
// convert output
unsigned short ef_bit = bit - npos;
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devOutRawBuffer_ptr[i] = clip< uint16_t >((uint16_t) devOutputBuffer_ptr[i],
0,
(uint16_t) pow(2, ef_bit)) << npos;
}
}
break;
default:
break;
}
}
bindings is a pointer to an array, the 1st element in the array is a device pointer that points to a buffer allocated using cudaMalloc on the gpu, each element in the buffer is a 16bit integer.the 2nd one the same, used to store the output data.
height,width,channel,colorFmt(RGB here),pixelType(PDT_INT8, aka 8bit) respective to the image height, width,channel number, colorspace, bits to store one pixel value.
the _forward function requires a pointer to an array, similar to bindings except that each element in the buffer should be a 32bit float number.
so I make some transformation using a loop
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
the >> operation is because the actual 8bit data is stored in the high 8 bit.
SEGMENTATION FAULT occurred at this line of code devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); and i equals 0.
I try to separate this code into several line:
uint16_t value = devInRawBuffer_ptr[i];
float transferd = float(value >> npos);
devInputBuffer_ptr[i] = transferd;
and SEGMENTATION FAULT occurred at this line uint16_t value = devInRawBuffer_ptr[i];
I wonder that is this a valid way to assign value to an allocated gpu memory buffer?
PS: the buffer given in bindings are totally fine. they are from host memory using cudaMemcpy before the call to forward function, but I still paste the code below
nvinfer1::DataType type = nvinfer1::DataType::kHALF;
HostBuffer hostInputBuffer(volume, type);
DeviceBuffer deviceInputBuffer(volume, type);
HostBuffer hostOutputBuffer(volume, type);
DeviceBuffer deviceOutputBuffer(volume, type);
// HxWxC --> WxHxC
auto *hostInputDataBuffer = static_cast<unsigned short *>(hostInputBuffer.data());
for (unsigned w = 0; w < W; ++w) {
for (unsigned h = 0; h < H; ++h) {
for (unsigned c = 0; c < C; ++c) {
hostInputDataBuffer[w * H * C + h * C + c] = (unsigned short )(*(ppm.buffer.get() + h * W * C + w * C + c));
}
}
}
auto ret = cudaMemcpy(deviceInputBuffer.data(), hostInputBuffer.data(), volume * getElementSize(type),
cudaMemcpyHostToDevice);
if (ret != 0) {
std::cout << "CUDA failure: " << ret << std::endl;
return EXIT_FAILURE;
}
void *bindings[2] = {deviceInputBuffer.data(), deviceOutputBuffer.data()};
model->forward(bindings, H, W, C, sbsisr::ColorSpaceFmt::CFMT_RGB, sbsisr::PixelDataType::PDT_INT8);
In CUDA, it's generally not advisable to dereference a device pointer in host code. For example, you are creating a "device pointer" when you use cudaMalloc:
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
From the code you have posted, it's not possible to deduce that for devInRawBuffer_ptr but I'll assume it also is a device pointer.
In that case, to perform this operation:
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos);
}
You would launch a CUDA kernel, something like this:
// put this function definition at file scope
__global__ void shift_kernel(float *dst, uint16_t *src, size_t sz, unsigned short npos){
for (size_t idx = blockIdx.x*blockDim.x+threadIdx.x, idx < sz; idx += gridDim.x*blockDim.x) dst[idx] = (float)((src[idx]) >> npos);
}
// call it like this in your code:
kernel<<<160, 1024>>>(devInputBuffer_ptr, devInRawBuffer_ptr, volume, npos);
(coded in browser, not tested)
If you'd like to learn more about what's going on here, you may wish to study CUDA. For example, you can get most of the basic concepts here and by studying the CUDA sample code vectorAdd. The grid-stride loop is discussed here.

Rewrite C++ class in C [duplicate]

This question already has answers here:
How to find the size of an array (from a pointer pointing to the first element array)?
(17 answers)
Closed 2 years ago.
So, I am having some trouble rewriting a C++ class I made in C.
The C++ class has some private attributes:
int grid_width;
int grid_height;
const int group_width = 2;
const int group_height = 4;
std::vector<int> buffer;
It is initialized like so:
grid::grid(int width, int height) {
this->grid_width = width;
this->grid_height = height;
buffer.resize(this->grid_width / this->group_width * this->grid_height / this->group_height, 0);
}
It also comes with a clear function like so:
void grid::clear() {
// get_buffer_size returns elements in the buffer vector
for (int i = 0; i < get_buffer_size(); ++i) {
buffer[i] = 0x00;
}
}
Now, my attempt to rewrite this in C looks somewhat like this:
typedef struct
{
int width;
int height;
int *buffer;
} grid;
grid *grid_new(int grid_width, int grid_height)
{
if ((grid_width % 2 != 0) || (grid_height % 4 != 0))
return NULL;
int group_height = 4;
int group_width = 2;
grid *p_grid = calloc(grid_width / group_width * grid_height / group_height, sizeof(int));
p_grid->width = grid_width;
p_grid->height = grid_height;
return p_grid;
}
void grid_free(grid *p_grid)
{
free(p_grid->buffer);
free(p_grid);
}
void grid_clear(grid *g)
{
// ToDo: Iterate over all elements in the buffer
int elements = sizeof(g->buffer) / sizeof(int);
printf("Elements: %i", elements);
}
But for some reason, the amount of elements in my C code is always 2?
Does anyone know where I am messing up?
If the grid is initialized with 4 and 8, the expected buffer size should be 4, not 2. If it would be initialized with 10 and 24, the expected size would be 30, but it still remains 2 in my C example.
Your grid_new is allocating an array of grid structs and not a single grid with the correct number of elements.
You need to set buffer
Also, the number of elements in the grid is based on width/height and not sizeof(g->buffer) which is the size of the pointer and not the area to which it points
Here's the refactored code:
const int group_height = 4;
const int group_width = 2;
typedef struct {
int width;
int height;
int *buffer;
} grid;
grid *
grid_new(int grid_width, int grid_height)
{
if ((grid_width % 2 != 0) || (grid_height % 4 != 0))
return NULL;
grid *p_grid = calloc(1,sizeof(*p_grid));
// FIXME -- why???
grid_width /= group_width;
grid_height /= group_height;
p_grid->width = grid_width;
p_grid->height = grid_height;
p_grid->buffer = calloc(grid_width * grid_height,sizeof(int));
return p_grid;
}
void
grid_free(grid *p_grid)
{
free(p_grid->buffer);
free(p_grid);
}
void
grid_clear(grid *g)
{
// ToDo: Iterate over all elements in the buffer
int elements = g->width * g->height;
printf("Elements: %i", elements);
}
'sizeof' returns the number of bytes that specified type takes. in this case sizeof(g->buffer) is equal to sizeof(int*) and because you are using x64 processor sizeof all pointers is 8.

column number 1 is out of range 0..-1

I wrote a function in which I want to pass a pointer to array of structures. I donew() inside it and I want to fill it with data retrieved from PGresult structure (res). However I got errors:
column number 0 is out of range 0..-1
column number 1 is out of range 0..-1
column number 2 is out of range 0..-1
column number 3 is out of range 0..-1
Segmentation fault (core dumped)
here is my function
Database::selectAnnouncements(const int user_id, const char *cat_id, struct announcementStruct **pStruct, int *size) {
*size = PQntuples(res);
struct announcementStruct * ann = new struct announcementStruct[PQntuples(res)];
*pStruct = ann;
int cr_fnum = PQfnumber(res, "created_on");
int con_fnum = PQfnumber(res, "content");
int cat_fnum = PQfnumber(res, "category");
int rm_fnum = PQfnumber(res, "remove_on");
for (int i = 0; i < PQntuples(res); i++) {
const char *cr_val = PQgetvalue(res, i, cr_fnum);
const char *con_val = PQgetvalue(res, i, con_fnum);
const char *cat_val = PQgetvalue(res, i, cat_fnum);
const char *rm_val = PQgetvalue(res, i, rm_fnum);
(*pStruct[i]).creation_date = new char[strlen(cr_val)];
(*pStruct[i]).content = new char[strlen(con_val)];
(*pStruct[i]).category = new char[strlen(cat_val)];
(*pStruct[i]).removal_date = new char[strlen(rm_val)];
strcpy((*pStruct[i]).creation_date, cr_val);
strcpy((*pStruct[i]).content, con_val);
strcpy((*pStruct[i]).category, cat_val);
strcpy((*pStruct[i]).removal_date, rm_val);
}
for (int i = 0; i < PQntuples(res); i++) {
printf("%s ", (pStruct[i]->creation_date));
printf(" %s ", (pStruct[i]->content));
printf(" %s ", (pStruct[i]->category));
printf(" %s ", (pStruct[i]->removal_date));
printf("\n");
}
PQclear(res);
}
here is how I use it
struct announcementStruct *announcements = NULL;
int size;
db.selectAnnouncements(0, "DOGS", &announcements, &size);
You are definitely forgetting to null terminate the strings. Allocate strlen + 1 to fit the null. Forgetting a null is an easy cause of segfaults. Functions like strncpy snprintf help ensure things are safer.
The comment by Kevin is also correct. Make sure you get all 12 instances of this mistake (the original 4, the four in the strcpy and the 4 in the printf)

c++: Convert 24bpp to 8 bpp or 1bpp image

I have to convert a 24bpp image to a 1bpp image or 8bpp image based on color table. The caller expects a unsigned char* in either case (which would be further processed or maybe for now debug output by sending the BITMAPINFOHEADER.biBitCount to its proper value, 8 or 1).
I have code to extract the color index into the palette (colorIndexArray is from color conversion or dithering algorithms)... I can get the info for an 8bpp bitmap...
But my problem is, I don't know how to put this info into a 1bpp bitmap
typedef struct {
unsigned int size;
unsigned char* pixels;
} ColorIndexArray;
unsigned char* convertImage(const ColorIndexArray& colorIndexArray, unsigned int paletteSize)
{
unsigned char* outputImage;
if (paleteSize > 2)
{
outputImage = (unsigned char*)LocalAlloc(LPTR, colorIndexArray.size);
for (int i=0; i<colorIndexArray.size; i++)
*(outputImage+i) = colorIndexArray.pixels[i];
// this works great
}
else // monochrome, caller has palette colors likely b/w (or purple/magenta or anything), must be 1bpp
{
outputImage = (unsigned char*)LocalAlloc(LPTR, colorIndexArray.size / 8);
// how can i place the unsigned char* info (which is already
// determined based on desired algorithm, representing index in
// color table) into the output image inside a single bit ?
// (obviously its value for a monochrome image would be 0 or 1 but
// it is saved as unsigned char* at the algorithm output)
// And how do I advance the pointer ?
// Will it be type safe ? Aligned to byte ? or do I have to fill
// with something at the end to make multiple of 8 bits ?
}
return outputImage;
}
Trying this after comment suggestion:
#include <GdiPlus.h>
....
else {
Gdiplus::Bitmap monoBitmap(w, h, PixelFormat1bppIndexed);
Gdiplus::BitmapData monoBitmapData;
Gdiplus::Rect rect(0, 0, w, h);
monoBitmap.LockBits(&rect, Gdiplus::ImageLockModeWrite, PixelFormat1bppIndexed, &monoBitmapData);
outputImage = (unsigned char*)monoBitmapData.Scan0;
for (unsigned int y = 0; y < h; y++)
{
for (unsigned int x = 0; x < w; x++)
{
if (colorIndexArray.pixels[x + y * w])
outputImage[y*monoBitmapData.Stride + x / 8] |= (unsigned char)(0x80 >> (x % 8));
}
}
monoBitmap.UnlockBits(&monoBitmapData);
}
return outputImage;
(Also need to allocate the memory for outputImage)
Based on the example suggested by Hans Passant (thank you also for pointing out how important the stride is), I wrote this little conversion
unsigned long stride = (((w + 31) & ~31) >> 3);
outputImage = (unsigned char*)LocalAlloc(LPTR, stride * h);
for (unsigned int y = 0; y < h; y++)
{
unsigned char* b = (unsigned char*)LocalAlloc(LPTR, stride);
for (unsigned int x = 0; x < w; x++)
if (colorIndexArray.pixels[x + y * w])
b[x / 8] |= (unsigned char)(0x80 >> (x % 8));
CopyMemory(outputImage + stride * y, b, stride);
}

Extracting data from a file that was read into memory

I have a binary data file that contains 2d and 3d coordinates in such order:
uint32 numberOfUVvectors;
2Dvec uv[numberOfUVvectors];
uint32 numberOfPositionVectors;
3Dvec position[numberOfPositionVectors];
uint32 numberOfNormalVectors;
3Dvec normal[numberOfNormalVectors];
2Dvec and 3Dvec are structs composed from 2 and 3 floats respectively.
At first, I read all these values using the "usual" way:
in.read(reinterpret_cast<char *>(&num2d), sizeof(uint32));
2Dvectors.reserve(num2d); // It's for an std::vector<2DVec> 2Dvectors();
for (int i = 0; i < num2d; i++){
2Dvec 2Dvector;
in.read(reinterpret_cast<char *>(&2Dvector), sizeof(2DVec));
2Dvectors.push_back(2Dvector);
}
It worked fine, but it was painfully slow (there can be more than 200k entries in a file and with so many read calls, the hdd access became a bottleneck). I decided to read the entire file into a buffer at once:
in.seekg (0, in.end);
int length = in.tellg();
in.seekg (0, in.beg);
char * buffer = new char [length];
is.read (buffer,length);
The reading is way faster now, but here's the question: how to parse that char buffer back into integers and structs?
To answer your specific question:
unsigned char * pbuffer = (unsigned char *)buffer;
uint32 num2d = *((uint32 *)pbuffer);
pbuffer += sizeof(uint32);
if(num2d)
{
2Dvec * p2Dvec = (2Dvec *)pbuffer;
2Dvectors.assign(p2Dvec, p2Dvec + num2d);
pbuffer += (num2d * sizeof(2Dvec));
}
uint32 numpos = *((uint32 *)pbuffer);
pbuffer += sizeof(uint32);
if(numpos)
{
3Dvec * p3Dvec = (3Dvec *)pbuffer;
Pos3Dvectors.assign(p3Dvec, p3Dvec + numpos);
pbuffer += (numpos * sizeof(3Dvec));
}
uint32 numnorm = *((uint32 *)pbuffer);
pbuffer += sizeof(uint32);
if(numnorm)
{
3Dvec * p3Dvec = (3Dvec *)pbuffer;
Normal3Dvectors.assign(p3Dvec, p3Dvec + numnorm);
pbuffer += (numnorm * sizeof(3Dvec));
}
// do not forget to release the allocated buffer
A an even faster way would be:
in.read(reinterpret_cast<char *>(&num2d), sizeof(uint32));
if(num2d)
{
2Dvectors.resize(num2d);
2Dvec * p2Dvec = &2Dvectors[0];
in.read(reinterpret_cast<char *>(&p2Dvec), num2d * sizeof(2Dvec));
}
//repeat for position & normal vectors
Use memcpy with the appropriate sizes and start values
or cast the values (example):
#include <iostream>
void copy_array(void *a, void const *b, std::size_t size, int amount)
{
std::size_t bytes = size * amount;
for (int i = 0; i < bytes; ++i)
reinterpret_cast<char *>(a)[i] = static_cast<char const *>(b)[i];
}
int main()
{
int a[10], b[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
copy_array(a, b, sizeof(b[0]), 10);
for (int i = 0; i < 10; ++i)
std::cout << a[i] << ' ';
}