I've got a program which is reading processes virtual memory and some registers for some data, then making amendments to it.
Here I pass the contents of eax register to my function (this seems to work fine, but I thought it might demonstrate what types of data are being involved)
case EXCEPTION_SINGLE_STEP: // EXCEPTION_SINGLE_STEP = 0x80000004
bl_flag = TRUE;
memset((void *)&context, 0, 0x2CC);
context.ContextFlags = 0x10017;
thread = OpenThread(0x1FFFFF, 0, debug_event.dwThreadId);
GetThreadContext(thread, &context);
context.Eip = context.Eip + 1;
// sub_FD4BF0((HANDLE)(*((DWORD *)(lpThreadParameter))), context.Eax);
StringToHtml((HANDLE)(dwArray[0]), context.Eax);
SetThreadContext(thread, &context);
CloseHandle(thread);
break;
void StringToHtml(HANDLE hProcess, DWORD address)
{
WCHAR buff[0x100];
WCHAR html[0x100];
DWORD oldProt = 0, real = 0;
int len = 0;
VirtualProtectEx(hProcess, (LPVOID)address, 0x200, PAGE_READWRITE, &oldProt);
ReadProcessMemory(hProcess, (LPCVOID)address, (LPVOID)buff, 0x200, &real);
len = wcslen(buff);
int k = 0, j = 0;
wprintf(L"Found out chat string : \"%s\" \n", buff);
for (int pp = 0; pp < 0x100; pp++)
html[pp] = NULL;
while(j < len)
{
if (buff[j] == L'&')
{
if (wcsncmp((const WCHAR *)(buff + j + 1), L"lt;", 3) == 0)
{
//html[k] = L'<';
html[k] = L'<font color="#00FF10">';
k++;
j = j + 4;
continue;
}
I am aware this is an incomplete function snippet. However the issue is arriving at my for loop here.
for (int pp = 0; pp < 0x100; pp++)
If i enter more than 256 characters (I at first thought this would be enough) then it crashes. I have clearly missed something obvious as I tried doing pp < len which I thought would use the buffer size, however, I still get the same crash.
How can I read the total size of the string entered in the chat into the loop and make it iterate over the WHOLE thing. Or at the very least catch this error?
Did you change the size of html and buffer according to the max of your for loop? Maybe that is already the solution.
Related
I'm a newbie for GPU programming using Cuda toolkit, and I have to write some code offering the functionality as I mentioned in the title.
I'd like to paste the code to show what exactly I want to do.
void CTrtModelWrapper::forward(void **bindings,
unsigned height,
unsigned width,
short channel,
ColorSpaceFmt colorFmt,
PixelDataType pixelType) {
uint16_t *devInRawBuffer_ptr = (uint16_t *) bindings[0];
uint16_t *devOutRawBuffer_ptr = (uint16_t *) bindings[1];
const unsigned short bit = 16;
float *devInputBuffer_ptr = nullptr;
float *devOutputBuffer_ptr = nullptr;
unsigned volume = height * width * channel;
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
common::cudaCheck(cudaMalloc((void **) &devOutputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
unsigned short npos = 0;
switch (pixelType) {
case PixelDataType::PDT_INT8: // high 8bit
npos = bit - 8;
break;
case PixelDataType::PDT_INT10: // high 10bit
npos = bit - 10;
break;
default:
break;
}
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
}
break;
default:
break;
}
void *rtBindings[2] = {devInputBuffer_ptr, devOutputBuffer_ptr};
// forward
this->_forward(rtBindings);
// convert output
unsigned short ef_bit = bit - npos;
switch (colorFmt) {
case CFMT_RGB: {
for (unsigned i = 0; i < volume; ++i) {
devOutRawBuffer_ptr[i] = clip< uint16_t >((uint16_t) devOutputBuffer_ptr[i],
0,
(uint16_t) pow(2, ef_bit)) << npos;
}
}
break;
default:
break;
}
}
bindings is a pointer to an array, the 1st element in the array is a device pointer that points to a buffer allocated using cudaMalloc on the gpu, each element in the buffer is a 16bit integer.the 2nd one the same, used to store the output data.
height,width,channel,colorFmt(RGB here),pixelType(PDT_INT8, aka 8bit) respective to the image height, width,channel number, colorspace, bits to store one pixel value.
the _forward function requires a pointer to an array, similar to bindings except that each element in the buffer should be a 32bit float number.
so I make some transformation using a loop
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
the >> operation is because the actual 8bit data is stored in the high 8 bit.
SEGMENTATION FAULT occurred at this line of code devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); and i equals 0.
I try to separate this code into several line:
uint16_t value = devInRawBuffer_ptr[i];
float transferd = float(value >> npos);
devInputBuffer_ptr[i] = transferd;
and SEGMENTATION FAULT occurred at this line uint16_t value = devInRawBuffer_ptr[i];
I wonder that is this a valid way to assign value to an allocated gpu memory buffer?
PS: the buffer given in bindings are totally fine. they are from host memory using cudaMemcpy before the call to forward function, but I still paste the code below
nvinfer1::DataType type = nvinfer1::DataType::kHALF;
HostBuffer hostInputBuffer(volume, type);
DeviceBuffer deviceInputBuffer(volume, type);
HostBuffer hostOutputBuffer(volume, type);
DeviceBuffer deviceOutputBuffer(volume, type);
// HxWxC --> WxHxC
auto *hostInputDataBuffer = static_cast<unsigned short *>(hostInputBuffer.data());
for (unsigned w = 0; w < W; ++w) {
for (unsigned h = 0; h < H; ++h) {
for (unsigned c = 0; c < C; ++c) {
hostInputDataBuffer[w * H * C + h * C + c] = (unsigned short )(*(ppm.buffer.get() + h * W * C + w * C + c));
}
}
}
auto ret = cudaMemcpy(deviceInputBuffer.data(), hostInputBuffer.data(), volume * getElementSize(type),
cudaMemcpyHostToDevice);
if (ret != 0) {
std::cout << "CUDA failure: " << ret << std::endl;
return EXIT_FAILURE;
}
void *bindings[2] = {deviceInputBuffer.data(), deviceOutputBuffer.data()};
model->forward(bindings, H, W, C, sbsisr::ColorSpaceFmt::CFMT_RGB, sbsisr::PixelDataType::PDT_INT8);
In CUDA, it's generally not advisable to dereference a device pointer in host code. For example, you are creating a "device pointer" when you use cudaMalloc:
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
From the code you have posted, it's not possible to deduce that for devInRawBuffer_ptr but I'll assume it also is a device pointer.
In that case, to perform this operation:
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos);
}
You would launch a CUDA kernel, something like this:
// put this function definition at file scope
__global__ void shift_kernel(float *dst, uint16_t *src, size_t sz, unsigned short npos){
for (size_t idx = blockIdx.x*blockDim.x+threadIdx.x, idx < sz; idx += gridDim.x*blockDim.x) dst[idx] = (float)((src[idx]) >> npos);
}
// call it like this in your code:
kernel<<<160, 1024>>>(devInputBuffer_ptr, devInRawBuffer_ptr, volume, npos);
(coded in browser, not tested)
If you'd like to learn more about what's going on here, you may wish to study CUDA. For example, you can get most of the basic concepts here and by studying the CUDA sample code vectorAdd. The grid-stride loop is discussed here.
This is my struct
/* wave data block header */
typedef struct wavehdr_tag {
LPSTR lpData; /* pointer to locked data buffer */
DWORD dwBufferLength; /* length of data buffer */
DWORD dwBytesRecorded; /* used for input only */
DWORD_PTR dwUser; /* for client's use */
DWORD dwFlags; /* assorted flags (see defines) */
DWORD dwLoops; /* loop control counter */
struct wavehdr_tag FAR *lpNext; /* reserved for driver */
DWORD_PTR reserved; /* reserved for driver */
} WAVEHDR, *PWAVEHDR, NEAR *NPWAVEHDR, FAR *LPWAVEHDR;
I have this variable WAVEHDR waveHeader;
I record 10 secs from microphone and waveHeader->lpData has my raw recorded data, and waveHeader->dwBytesRecorded is the raw data's length
Now I want to calculate the volume in each second to say which second has highest volume and which one has the lowest.
I know I should sum the absolute values and divide by the number of samples
I used sum += abs(waveHeader->lpData[i]); for i from 0 to length of one secs data, but it doesn't give me a good result
it always gives me the same result for each second, but I am silent in some seconds and speak in some...
I read I have to add samples, not bytes How should I convert waveHeader->lpData[i] to samples?
//len = length of one secs data (waveHeader->dwBytesRecorded/10)
for (int i=0; i<len; i++)
{
sum += abs(waveHeader->lpData[i]);
}
You have the WAVEFORMATEX used for capturing the audio, right? If so, you can modify the following routine to meet your needs:
void ProcessSamples(WAVEHDR* header, WAVEFORMATEX* format)
{
BYTE* pData = (BYTE*)(header->data);
DWORD dwNumSamples = header->dwBytesRecorded / format->nBlockAlign;
// 16-bit stereo, the most common format
if ((format->wBitsPerSample == 16) && (format->nChannels == 2))
{
for (DWORD index = 0; index < dwNumSamples; index++)
{
short left = *(short*)pData; pData+=2;
short right = *(short*)pData; pData+=2;
}
}
else if ((format->wBitsPerSample == 16) && (format->nChannels == 1))
{
for (DWORD index = 0; index < dwNumSamples; index++)
{
short monoSample = *(short*)pData; pData+=2;
}
}
else if ((format->wBitsPerSample == 8) && (format->nChannels == 2))
{
// 8-bit samples are unsigned.
// "128" is the median silent value
// normalize to a "signed" value
for (DWORD index = 0; index < dwNumSamples; index++)
{
signed char left = (*(signed char*)pData) - 128; pData += 1;
signed char right = (*(signed char*)pData) - 128; pData += 1;
}
}
else if ((format->wBitsPerSample == 8) && (format->nChannels == 1))
{
for (DWORD index = 0; index < dwNumSamples; index++)
{
signed char monosample = (*(signed char*)pData) - 128; pData += 1;
}
}
}
I am currently using this approach to copy some byte values over:
for (int i = 0; i < (iLen + 1); i++)
{
*(pBuffer + i) = Image.pVid[i];
}
I would like to ask if there is a way to copy these values over in one go, perhaps by using memcopy to gain more speed.
The entire code is:
extern "C" __declspec(dllexport) int __stdcall GetCameraImage(BYTE pBuffer[], int Type, int uWidth, int uHeight)
{
CameraImage Image;
int ret;
Image.pVid = (unsigned int*)malloc(4 * uWidth*uHeight);
ret = stGetCameraImage(&Image, 1, uWidth, uHeight);
if (ret == ERR_SUCCESS)
{
int iLen = (4 * uWidth * uHeight);
for (int i = 0; i < (iLen + 1); i++)
{
*(pBuffer + i) = Image.pVid[i];
}
////print(“ImageType = %d, width = %d, height = %d”, Image.Type, Image.Width,
//// Image.Height);
////print(“First Pixel : B = %d, G = %d, R = %d”, Image.pVid[0], Image.pVid[1],
//// Image.pVid[2]);
////print(“Second Pixel : B = %d, G = %d, R = %d”, Image.pVid[4], Image.pVid[5],
//// Image.pVid[6]);
}
free(Image.pVid);
return ret;
}
Edit:
*pVid is this:
unsigned int *pVid; // pointer to image data (Format RGB32...)
The way your code is currently written, each assignment in your loop will overflow and give you some garbage value in pBuffer because you're trying to assign an unsigned int to a BYTE. On top of that, you will run off the end of the Image.pVid array because i is counting bytes, not unsigned ints
You could fix your code by doing this:
*(pBuffer + i) = ((BYTE*)Image.pVid)[i];
But that is pretty inefficient. Better to move whole words at a time, or you could just use memcpy instead:
memcpy(pBuffer,Image.pVid,iLen) //pBuffer must be at least iLen bytes long
std::wstring hashStr(L"4727b105cf792b2d8ad20424ed83658c");
//....
byte digest[16];
How can I get my md5 hash in digest?
My answer is:
wchar_t * EndPtr;
for (int i = 0; i < 16; i++) {
std::wstring bt = hashStr.substr(i*2, 2);
digest[i] = static_cast<BYTE>(wcstoul(bt.c_str(), &EndPtr, 16));
}
You need to read two characters from hashStr, convert them from hex to a binary value, and put that value into the next spot in digest -- something on this order:
for (int i=0; i<16; i++) {
std::wstring byte = hashStr.substr(i*2, 2);
digest[i] = hextobin(byte);
}
C-way (I didn't test it, but it should work (though I could've screwed up somewhere) and you will get the method anyway).
memset(digest, 0, sizeof(digest));
for (int i = 0; i < 32; i++)
{
wchar_t numwc = hashStr[i];
BYTE numbt;
if (numwc >= L'0' && numwc <= L'9') //I assume that the string is right (i.e.: no SGJSGH chars and stuff) and is in uppercase (you can change that though)
{
numbt = (BYTE)(numwc - L'0');
}
else
{
numbt = 0xA + (BYTE)(numwc - L'A');
}
digest[i/2] += numbt*(2<<(4*((i+1)%2)));
}
So when I run my code it executes perfectly, but when I try to run it in Visual Profiler it works the first time, but it seems to want to run the program seven times, and the second time it results in an unspecified launch failure. Why would that happen? My code looks like below, and my error checking tells me the error is occurring with
cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
(Probably easiest to find in the code by searching for memcpy11, it'll be the line above)
I can't think of a reason a program would essentially seg fault the second time it's run but not the first, and if I run it multiple times in terminal it's totally fine. Can anyone come up with what might be going on?
Thanks!
void fillin(node *p, node *left, node *rt)
{
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
seqptr lsites;
cudaMalloc((void **) &lsteps, stepsize);
checkCUDAError("malloc");
cudaMalloc((void **) &lsites, sitesize);
checkCUDAError("malloc");
cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy7");
cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy8");
steptr rsteps;
seqptr rsites;
cudaMalloc((void **) &rsteps, stepsize);
checkCUDAError("malloc");
cudaMalloc((void **) &rsites, sitesize);
checkCUDAError("malloc");
cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy9");
cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//call kernel
int block_size = 1;
int n_blocks = chars;
fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy10");
cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy11");
cudaFree(rsites); cudaFree(rsteps);
cudaFree(lsites); cudaFree(lsteps);
checkCUDAError("free");
}
}
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
boolean counted;
aas aa;
long s;
long i, j, k, n;
int idx = blockIdx.x;
//reduce array references; may or may not be useful
__shared__ long ls[3];
__shared__ long rs[3];
__shared__ long qs[3];
counted = false;
k = 0;
//computation from original program, but now need to do manual address calculation
if(idx < max){
for(i = 0; i < 3; i++){
rs[i]=rsite[idx][i];
ls[i]=lsite[idx][i];
}
n = lsteps[idx] + rsteps[idx];
counted = false;
for (i = 0; i <= 5; i++) {
if (k < 3) {
switch (i) {
case 0:
s = ls[0] & rs[0];
break;
case 1:
s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
break;
case 2:
s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
break;
case 3:
s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
break;
case 4:
s = ls[1] | (ls[2] & rs[2]) | rs[1];
break;
case 5:
s = ls[2] | rs[2];
break;
}
if (counted || s != 0) {
qs[k] = s;
k++;
counted = true;
} else if (!counted)
n += cudaWeight[idx];
}
}
for (i = 0; i <= 1; i++) {
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
if (((1L << ((long)aa)) & qs[i]) != 0) {
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
rsteps[idx] = n;
for(i = 0; i < 3; i++)
rsite[idx][i]=qs[i];
}
}
Try to disable all counters in the profile session settings. Also try to remove all files like "temp_compute_profiler_1_1.csv" from your working folder (see profile setting "Working Folder", by default in is the same with the place of your executable).
There is the same error (OpenCL over CUDA): http://www.khronos.org/message_boards/viewtopic.php?t=4324