This is my very first question:
First of these 2 functions you see here below works fine to some extent:
Uint32 AWSprite::get_pixelColor_location(SDL_Surface * surface, int x, int y) {
int bpp = surface->format->BytesPerPixel;
/* Here p is the address to the pixel we want to retrieve */
Uint8 *p = (Uint8 *)surface->pixels + y * surface->pitch + x * bpp;
switch (bpp) {
case 1:
return *p;
case 2:
return *(Uint16 *)p;
case 3:
if (SDL_BYTEORDER == SDL_BIG_ENDIAN)
return p[0] << 16 | p[1] << 8 | p[2];
else
return p[0] | p[1] << 8 | p[2] << 16;
case 4:
return *(Uint32 *)p;
default:
return 0;
}
}
void AWSprite::set_all_frame_image_actual_size() {
/* This function finds an entire rows that has transparency
then stores the amount of rows to a Frame_image_absolute structure
*/
absolute_sprite = new Frame_image_absolute*[howManyFrames];
for (int f = 0; f < howManyFrames; f++) {
SDL_LockSurface(frames[f]);
int top_gap = 0; int bottom_gap = 0;
int per_transparent_px_count = 1;
for (int i = 0; i < frames[f]->h; i++) {
int per_transparent_px_count = 1;
if (this->get_pixelColor_location(frames[f], j, i) == transparentColour) per_transparent_px_count++;
if (per_transparent_px_count >= frames[f]->w) {
if (i < frames[f]->h / 2) {
per_transparent_px_count = 1;
top_gap++;
} else {
per_transparent_px_count = 1;
bottom_gap++;
}
}
}
}
int realHeight = frames[f]->h - (top_gap + bottom_gap);
absolute_sprite[f] = new Frame_image_absolute();
absolute_sprite[f]->offset_y = top_gap;
absolute_sprite[f]->height = realHeight;
}
}
When i ran this i get:
Unhandled exception at 0x00173746 in SE Game.exe: 0xC0000005: Access violation reading location 0x03acc0b8.
When i when through debuging, i found that it crashes at:
When iterators variable f == 31, i == 38, j = 139
And stops at AWSprite::get_pixelColor_location() in the line at " return *(Uint32 *)p;
I found that if i ran it again and go through debugging line by line then i will works sometime and sometime it dont!!! So i mean that "It crash at randomly when f > 30, i, j iterators value"
What is going on...
I cannot comment on the question yet, but here are some questions:
Where does j come from? Based on the get_pixelColor_location function I would assume that you're iterating over the width of the surface. This part seems to be missing from the code you posted.
Did you validate that i and j are within the bounds of your surface?
Also, you don't seem to Unlock the surface.
Running your function seems to work adequately here so I suspect you're reading outside of your buffer with invalid parameters.
Related
I'm a newbie for GPU programming using Cuda toolkit, and I have to write some code offering the functionality as I mentioned in the title.
I'd like to paste the code to show what exactly I want to do.
void CTrtModelWrapper::forward(void **bindings,
unsigned height,
unsigned width,
short channel,
ColorSpaceFmt colorFmt,
PixelDataType pixelType) {
uint16_t *devInRawBuffer_ptr = (uint16_t *) bindings[0];
uint16_t *devOutRawBuffer_ptr = (uint16_t *) bindings[1];
const unsigned short bit = 16;
float *devInputBuffer_ptr = nullptr;
float *devOutputBuffer_ptr = nullptr;
unsigned volume = height * width * channel;
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
common::cudaCheck(cudaMalloc((void **) &devOutputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
unsigned short npos = 0;
switch (pixelType) {
case PixelDataType::PDT_INT8: // high 8bit
npos = bit - 8;
break;
case PixelDataType::PDT_INT10: // high 10bit
npos = bit - 10;
break;
default:
break;
}
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
}
break;
default:
break;
}
void *rtBindings[2] = {devInputBuffer_ptr, devOutputBuffer_ptr};
// forward
this->_forward(rtBindings);
// convert output
unsigned short ef_bit = bit - npos;
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devOutRawBuffer_ptr[i] = clip< uint16_t >((uint16_t) devOutputBuffer_ptr[i],
0,
(uint16_t) pow(2, ef_bit)) << npos;
}
}
break;
default:
break;
}
}
bindings is a pointer to an array, the 1st element in the array is a device pointer that points to a buffer allocated using cudaMalloc on the gpu, each element in the buffer is a 16bit integer.the 2nd one the same, used to store the output data.
height,width,channel,colorFmt(RGB here),pixelType(PDT_INT8, aka 8bit) respective to the image height, width,channel number, colorspace, bits to store one pixel value.
the _forward function requires a pointer to an array, similar to bindings except that each element in the buffer should be a 32bit float number.
so I make some transformation using a loop
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
the >> operation is because the actual 8bit data is stored in the high 8 bit.
SEGMENTATION FAULT occurred at this line of code devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); and i equals 0.
I try to separate this code into several line:
uint16_t value = devInRawBuffer_ptr[i];
float transferd = float(value >> npos);
devInputBuffer_ptr[i] = transferd;
and SEGMENTATION FAULT occurred at this line uint16_t value = devInRawBuffer_ptr[i];
I wonder that is this a valid way to assign value to an allocated gpu memory buffer?
PS: the buffer given in bindings are totally fine. they are from host memory using cudaMemcpy before the call to forward function, but I still paste the code below
nvinfer1::DataType type = nvinfer1::DataType::kHALF;
HostBuffer hostInputBuffer(volume, type);
DeviceBuffer deviceInputBuffer(volume, type);
HostBuffer hostOutputBuffer(volume, type);
DeviceBuffer deviceOutputBuffer(volume, type);
// HxWxC --> WxHxC
auto *hostInputDataBuffer = static_cast<unsigned short *>(hostInputBuffer.data());
for (unsigned w = 0; w < W; ++w) {
for (unsigned h = 0; h < H; ++h) {
for (unsigned c = 0; c < C; ++c) {
hostInputDataBuffer[w * H * C + h * C + c] = (unsigned short )(*(ppm.buffer.get() + h * W * C + w * C + c));
}
}
}
auto ret = cudaMemcpy(deviceInputBuffer.data(), hostInputBuffer.data(), volume * getElementSize(type),
cudaMemcpyHostToDevice);
if (ret != 0) {
std::cout << "CUDA failure: " << ret << std::endl;
return EXIT_FAILURE;
}
void *bindings[2] = {deviceInputBuffer.data(), deviceOutputBuffer.data()};
model->forward(bindings, H, W, C, sbsisr::ColorSpaceFmt::CFMT_RGB, sbsisr::PixelDataType::PDT_INT8);
In CUDA, it's generally not advisable to dereference a device pointer in host code. For example, you are creating a "device pointer" when you use cudaMalloc:
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
From the code you have posted, it's not possible to deduce that for devInRawBuffer_ptr but I'll assume it also is a device pointer.
In that case, to perform this operation:
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos);
}
You would launch a CUDA kernel, something like this:
// put this function definition at file scope
__global__ void shift_kernel(float *dst, uint16_t *src, size_t sz, unsigned short npos){
for (size_t idx = blockIdx.x*blockDim.x+threadIdx.x, idx < sz; idx += gridDim.x*blockDim.x) dst[idx] = (float)((src[idx]) >> npos);
}
// call it like this in your code:
kernel<<<160, 1024>>>(devInputBuffer_ptr, devInRawBuffer_ptr, volume, npos);
(coded in browser, not tested)
If you'd like to learn more about what's going on here, you may wish to study CUDA. For example, you can get most of the basic concepts here and by studying the CUDA sample code vectorAdd. The grid-stride loop is discussed here.
I have an assignment where image composition is done using SAD. And another task is to use MSE instead of SAD in the code. Im struggling with it so can anyone help me with this? Here is the code for SAD.
find_motion(my_image_comp *ref, my_image_comp *tgt,
int start_row, int start_col, int block_width, int block_height)
/* This function finds the motion vector which best describes the motion
between the `ref' and `tgt' frames, over a specified block in the
`tgt' frame. Specifically, the block in the `tgt' frame commences
at the coordinates given by `start_row' and `start_col' and extends
over `block_width' columns and `block_height' rows. The function finds
the translational offset (the returned vector) which describes the
best matching block of the same size in the `ref' frame, where
the "best match" is interpreted as the one which minimizes the sum of
absolute differences (SAD) metric. */
{
mvector vec, best_vec;
int sad, best_sad=256*block_width*block_height;
for (vec.y=-8; vec.y <= 8; vec.y++)
for (vec.x=-8; vec.x <= 8; vec.x++)
{
int ref_row = start_row-vec.y;
int ref_col = start_col-vec.x;
if ((ref_row < 0) || (ref_col < 0) ||
((ref_row+block_height) > ref->height) ||
((ref_col+block_width) > ref->width))
continue; // Translated block not containe within reference frame
int r, c;
int *rp = ref->buf + ref_row*ref->stride + ref_col;
int *tp = tgt->buf + start_row*tgt->stride + start_col;
for (sad=0, r=block_height; r > 0; r--,
rp+=ref->stride, tp+=tgt->stride)
for (c=0; c < block_width; c++)
{
int diff = tp[c] - rp[c];
sad += (diff < 0)?(-diff):diff;
}
if (sad < best_sad)
{
best_sad = sad;
best_vec = vec;
}
}
return best_vec;
}
I got the answer myself I think.
its,
for (mse = 0, r = block_height; r > 0; r--,
rp+=ref->stride, tp+=tgt->stride)
for (c=0; c < block_width; c++)
{
int diff = tp[c] - rp[c];
temp = (diff*diff) / (block_height*block_width);
mse += temp;
temp = 0;
}
if (mse < best_mse)
{
best_mse = mse;
best_vec = vec;
}
}
return best_vec;
}
I am a C++ newbie.
Context: I found this third-party snippet of code that seems to work, but based on my (very limited) knowledge of C++ I suspect it will cause problems. The snippet is as follows:
int aVariable;
int anInt = 1;
int anotherInt = 2;
int lastInt = 3;
aVariable = CHAIN(anInt, anotherInt, lastInt);
Where CHAIN is defined as follows (this is part of a library):
int CHAIN(){ Map(&CHAIN, MakeProcInstance(&_CHAIN), MAP_IPTR_VPN); }
int _CHAIN(int i, int np, int p){ return ASMAlloc(np, p, &chainproc); }
int keyalloc[16384], kpos, alloc_locked, tmp[4];
int ASMAlloc(int np, int p, alias proc)
{
int v, x;
// if(alloc_locked) return 0 & printf("WARNING: you can declare compound key statements (SEQ, CHAIN, EXEC, TEMPO, AXIS) only inside main() call, and not during an event.\xa");
v = elements(&keyalloc) - kpos - 4;
if(v < np | !np) return 0; // not enough allocation space or no parameters
Map(&v, p); Dim(&v, np); // v = params array
keyalloc[kpos] = np + 4; // size
keyalloc[kpos+1] = &proc; // function
keyalloc[kpos+2] = kpos + 2 + np; // parameters index
while(x < np)
{
keyalloc[kpos+3+x] = v[x];
x = x+1;
}
keyalloc[kpos+3+np] = kpos + 3 | JUMP;
x = ASMFind(kpos);
if(x == kpos) kpos = kpos + np + 4;
return x + 1 | PROC; // skip block size
}
int ASMFind(int x)
{
int i, j, k; while(i < x)
{
k = i + keyalloc[i]; // next
if(keyalloc[i] == keyalloc[x]) // size
if(keyalloc[i+1] == keyalloc[x+1]) // proc
{
j = x-i;
i = i+3;
while(keyalloc[i] == keyalloc[j+i]) i = i+1; // param
if((keyalloc[i] & 0xffff0000) == JUMP) return x-j;
}
i = k;
}
return x;
}
EDIT:
The weird thing is that running
CHAIN(aVariable);
effectively executes
CHAIN(anInt, anotherInt, lastInt);
Somehow. This is what led me to believe that aVariable is, in fact, a pointer.
QUESTION:
Is it correct to store a parametrized function call into an integer variable like so? Does "aVariable" work just as a pointer, or is this likely to corrupt random memory areas?
You're calling a function (through an obfuscated interface), and storing the result in an integer. It might or might not cause problems, depending on how you use the value / what you expect it to mean.
Your example contains too many undefined symbols for the reader to provide any better answer.
Also, I think this is C, not C++ code.
I am currently working to solve Project Euler's Problem #60: http://projecteuler.net/problem=60 (Just in case if you want to try and follow my logic).
The issue is that after I build my code (Which it completes without errors) and then run it, I get the error code "Thread 1: EXC_Bad_Access (Code=1, address=0x7fff55056148)" from the IDE I was using while running it (The IDE's built in debugger I think). More Specifically the error occurs only within my "Combinations" Function. The lines that get highlighted are disabled with "//" comment lines within my combinations function. Thus, currently, my code will run without any errors because all the error-causing lines are disabled as comments. if you de-comment any of those lines or any combination of those lines thereof, the code runs into the same error code listed above.
Personal Comments from Experimentation:
What I found was that any line that has something to do with either ofstream or the integer that I initialized called count causes the error. ofstream kind of makes sense, but even after disabling all lines of code related to ofstream, suddenly the integer count starts creating the error.
Any help would be much appreciated! I am still a beginner with C++, (started about two to three weeks ago.)
#include <iostream>
#include <cmath>
#include <fstream>
using namespace std;
/* double x = 2 , y = 2 , b = 3, s = 2; */
/* int z, c = 1, v = 3000; */
int AllPrimes[3000];
/* int AllCombos[2018257871250650][5]; */ // disabled this line for now.
//Used to be within Combinations; Moved here to circumvent "Bad Access" Error
int FindPrimes();
int TestforPrime(double y);
int Combinations();
int WriteArrayToFile(int *ArrayPointer,int ArrayLength, string FileName, char Append);
int main()
{
cout<<FindPrimes();
cout<<Combinations();
}
int Combinations() {
int i1, i2, i3, i4, i5, /* ai */ bi, ci, di, ei;
int ZeroPointBreaker=0;
//ofstream BufferFlushed ("/Users/Yash/Xcode/Projects/Project Euler Programs/Project Euler Problem 60 (Weird Prime Cocatenate Property Problem)/I:O Files/");
int count=0;
int Buffer[9000000][5];
for (i1=0; i1<2996; i1++) {
count++;
// cout<<"Index 1 Iteration: "<<i1<<" || Count Value: "<<count<<"\n";
bi = i1 + 1;
for (i2=bi; i2<2997; i2++) {
count++;
// cout<<"Index 2 Iteration: "<<i2<<" || Count Value: "<<count<<"\n";
ci = i2+ 1;
for (i3=ci; i3<2998; i3++) {
count++;
di = i3 + 1;
for (i4=di; i4<2999; i4++) {
count++;
ei = i4 + 1;
for (i5=ei; i5<3000; i5++) {
count++;
// Buffer[count][0]=AllPrimes[i1];
// Buffer[count][1]=AllPrimes[i2];
// Buffer[count][2]=AllPrimes[i3];
// Buffer[count][3]=AllPrimes[i4];
// Buffer[count][4]=AllPrimes[i5];
}
}
}
//Flush Here
// count=0;
/* for (int i=0; i<9000000; i++) {
if (Buffer[i][1]==0) {ZeroPointBreaker=i; break;}
} */
// for (int i=0; i<ZeroPointBreaker; i++) {
// BufferFlushed<<Buffer[i][1]<<','<<Buffer[i][2]<<','<<Buffer[i][3]<<','<<Buffer[i][4]<<','<<Buffer[i][5]<<'\n';
// }
}
}
//End of Code Statements
//BufferFlushed.close();
return 0;
}
int FindPrimes() {
cout.precision(0);
AllPrimes[0]=2;
double b = 3, s = 2;
int z, c = 1, v = 3000;
while ( c != v ) {
z = TestforPrime(b);
if ( z == 1 ) {
AllPrimes[c]=b;
c = c + 1;
s = s + b;
if ( c == v ) {
cout<<fixed<<" Prime="<<b<<" Count="<<c<<" "<<"Sum="<<s<<"\n";
int success = WriteArrayToFile(AllPrimes,3000,"/Users/Yash/Xcode/Projects/Project Euler Programs/Project Euler Problem 60 (Weird Prime Cocatenate Property Problem)/I:O Files/AllPrimes.txt",'n');
cout<<"\n Write Success (0=Successful): "<<success<<"\n";
if (success == 0) {return 0;}
else {return 1;}
}
else {
};
}
else {
};
b = b + 2;
}
}
int WriteArrayToFile(int *ArrayPointer,int ArrayLength, string FileName, char Append) {
if (Append == 'y') {
ofstream OutputFile (FileName, ios::app);
for ( unsigned long long i1=0 ; i1 < ArrayLength ; i1++) {
OutputFile<<ArrayPointer[i1]<<"\n";
}
OutputFile.close();
return 0;}
else if (Append == 'n') {
ofstream OutputFile (FileName);
for ( unsigned long long i1=0 ; i1 < ArrayLength ; i1++) {
OutputFile<<ArrayPointer[i1]<<"\n";
}
OutputFile.close();
return 0;}
}
int TestforPrime (double y) {
double x = 2;
while ( x <= y ) {
if ( (( y / x ) - int( y / x )) == 0 ) {
if ( y == x ) {
return 1;
}
else {
return 0;
}
}
x = x + 1;
}
}
This variable:
int Buffer[9000000][5];
takes up 45000000 * 4 Bytes. That's 180MB. You can't fit that on the stack. Use a global variable or dynamic allocation (or, more likely, another solution - I haven't looked at the problem itself, so don't know if your solution is "right").
So when I run my code it executes perfectly, but when I try to run it in Visual Profiler it works the first time, but it seems to want to run the program seven times, and the second time it results in an unspecified launch failure. Why would that happen? My code looks like below, and my error checking tells me the error is occurring with
cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
(Probably easiest to find in the code by searching for memcpy11, it'll be the line above)
I can't think of a reason a program would essentially seg fault the second time it's run but not the first, and if I run it multiple times in terminal it's totally fine. Can anyone come up with what might be going on?
Thanks!
void fillin(node *p, node *left, node *rt)
{
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
seqptr lsites;
cudaMalloc((void **) &lsteps, stepsize);
checkCUDAError("malloc");
cudaMalloc((void **) &lsites, sitesize);
checkCUDAError("malloc");
cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy7");
cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy8");
steptr rsteps;
seqptr rsites;
cudaMalloc((void **) &rsteps, stepsize);
checkCUDAError("malloc");
cudaMalloc((void **) &rsites, sitesize);
checkCUDAError("malloc");
cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy9");
cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//call kernel
int block_size = 1;
int n_blocks = chars;
fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy10");
cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy11");
cudaFree(rsites); cudaFree(rsteps);
cudaFree(lsites); cudaFree(lsteps);
checkCUDAError("free");
}
}
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
boolean counted;
aas aa;
long s;
long i, j, k, n;
int idx = blockIdx.x;
//reduce array references; may or may not be useful
__shared__ long ls[3];
__shared__ long rs[3];
__shared__ long qs[3];
counted = false;
k = 0;
//computation from original program, but now need to do manual address calculation
if(idx < max){
for(i = 0; i < 3; i++){
rs[i]=rsite[idx][i];
ls[i]=lsite[idx][i];
}
n = lsteps[idx] + rsteps[idx];
counted = false;
for (i = 0; i <= 5; i++) {
if (k < 3) {
switch (i) {
case 0:
s = ls[0] & rs[0];
break;
case 1:
s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
break;
case 2:
s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
break;
case 3:
s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
break;
case 4:
s = ls[1] | (ls[2] & rs[2]) | rs[1];
break;
case 5:
s = ls[2] | rs[2];
break;
}
if (counted || s != 0) {
qs[k] = s;
k++;
counted = true;
} else if (!counted)
n += cudaWeight[idx];
}
}
for (i = 0; i <= 1; i++) {
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
if (((1L << ((long)aa)) & qs[i]) != 0) {
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
rsteps[idx] = n;
for(i = 0; i < 3; i++)
rsite[idx][i]=qs[i];
}
}
Try to disable all counters in the profile session settings. Also try to remove all files like "temp_compute_profiler_1_1.csv" from your working folder (see profile setting "Working Folder", by default in is the same with the place of your executable).
There is the same error (OpenCL over CUDA): http://www.khronos.org/message_boards/viewtopic.php?t=4324